From e4d609cf1b28df72afeb8828744085785eca9e76 Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Mon, 22 Dec 2025 15:15:32 +0000 Subject: [PATCH 01/18] Add new encoding tests and implementation --- include/cpp/encoding/ASCII.hpp | 24 +++ include/cpp/encoding/UTF16.hpp | 21 +++ include/cpp/encoding/UTF8.hpp | 35 ++++ include/cpp/marshal/View.hpp | 2 +- src/cpp/encoding/ASCII.cpp | 66 +++++++ src/cpp/encoding/UTF16.cpp | 201 ++++++++++++++++++++++ src/cpp/encoding/UTF8.cpp | 219 ++++++++++++++++++++++++ test/native/Native.hx | 6 +- test/native/tests/encoding/TestAscii.hx | 86 ++++++++++ test/native/tests/encoding/TestUtf16.hx | 180 +++++++++++++++++++ test/native/tests/encoding/TestUtf8.hx | 157 +++++++++++++++++ toolchain/haxe-target.xml | 7 + 12 files changed, 1002 insertions(+), 2 deletions(-) create mode 100644 include/cpp/encoding/ASCII.hpp create mode 100644 include/cpp/encoding/UTF16.hpp create mode 100644 include/cpp/encoding/UTF8.hpp create mode 100644 src/cpp/encoding/ASCII.cpp create mode 100644 src/cpp/encoding/UTF16.cpp create mode 100644 src/cpp/encoding/UTF8.cpp create mode 100644 test/native/tests/encoding/TestAscii.hx create mode 100644 test/native/tests/encoding/TestUtf16.hx create mode 100644 test/native/tests/encoding/TestUtf8.hx diff --git a/include/cpp/encoding/ASCII.hpp b/include/cpp/encoding/ASCII.hpp new file mode 100644 index 000000000..0d89fa63c --- /dev/null +++ b/include/cpp/encoding/ASCII.hpp @@ -0,0 +1,24 @@ +#pragma once + +namespace cpp +{ + namespace encoding + { + struct Ascii final + { + static bool isEncoded(const String& string); + + /// + /// Encode the provided string to ASCII bytes and write them to the buffer. + /// If the provided string is UTF16 encoded an exception is raised and nothing is written to the buffer. + /// + /// Number of chars written to the buffer. + static int64_t encode(const String& string, cpp::marshal::View buffer); + + /// + /// Create a string from the provided ASCII bytes. + /// + static String decode(cpp::marshal::View string); + }; + } +} \ No newline at end of file diff --git a/include/cpp/encoding/UTF16.hpp b/include/cpp/encoding/UTF16.hpp new file mode 100644 index 000000000..9bb3bcc36 --- /dev/null +++ b/include/cpp/encoding/UTF16.hpp @@ -0,0 +1,21 @@ +#pragma once + +namespace cpp +{ + namespace encoding + { + struct Utf16 final + { + static bool isEncoded(const String& string); + + static int32_t getByteCount(const char32_t& codepoint); + static int64_t getByteCount(const String& string); + + static int64_t encode(const String& string, cpp::marshal::View buffer); + static int64_t encode(const char32_t& codepoint, cpp::marshal::View buffer); + + static String decode(cpp::marshal::View buffer); + static int64_t decode(cpp::marshal::View buffer, char32_t& out); + }; + } +} \ No newline at end of file diff --git a/include/cpp/encoding/UTF8.hpp b/include/cpp/encoding/UTF8.hpp new file mode 100644 index 000000000..db06a9251 --- /dev/null +++ b/include/cpp/encoding/UTF8.hpp @@ -0,0 +1,35 @@ +#pragma once + +namespace cpp +{ + namespace encoding + { + struct Utf8 final + { + /// + /// Returns the number of bytes required to store the codepoint in it's UTF8 form. + /// + static int64_t getByteCount(const char32_t& codepoint); + + /// + /// Returns the number of bytes required to store the string in it's UTF8 form. + /// + static int64_t getByteCount(const String& string); + + /// + /// Writes the provided string in it's UTF8 form to the buffer. + /// + /// Number of byte written into the buffer + static int64_t encode(const String& string, cpp::marshal::View buffer); + + /// + /// Writes the provided code point in it's UTF8 form to the buffer. + /// + /// Number of byte written into the buffer + static int64_t encode(const char32_t& codepoint, cpp::marshal::View buffer); + + static String decode(cpp::marshal::View buffer); + static int64_t decode(cpp::marshal::View buffer, char32_t& out); + }; + } +} \ No newline at end of file diff --git a/include/cpp/marshal/View.hpp b/include/cpp/marshal/View.hpp index 6b2ad2d1f..f62e6ad07 100644 --- a/include/cpp/marshal/View.hpp +++ b/include/cpp/marshal/View.hpp @@ -67,7 +67,7 @@ template template inline cpp::marshal::View cpp::marshal::View::reinterpret() { - auto newPtr = ::cpp::Pointer{ ptr.reinterpret() }; + auto newPtr = ::cpp::Pointer(reinterpret_cast(ptr.ptr)); auto fromSize = sizeof(T); auto toSize = sizeof(K); diff --git a/src/cpp/encoding/ASCII.cpp b/src/cpp/encoding/ASCII.cpp new file mode 100644 index 000000000..3115e19cd --- /dev/null +++ b/src/cpp/encoding/ASCII.cpp @@ -0,0 +1,66 @@ +#include + +using namespace cpp::marshal; + +bool cpp::encoding::Ascii::isEncoded(const String& string) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + return string.isAsciiEncoded(); +} + +int64_t cpp::encoding::Ascii::encode(const String& string, View buffer) +{ + if (hx::IsNull(string)) + { + hx::NullReference("String", false); + } + + if (string.isUTF16Encoded()) + { + hx::Throw(HX_CSTRING("String cannot be encoded to ASCII")); + } + + auto src = cpp::marshal::View(string.raw_ptr(), string.length).reinterpret(); + + if (src.tryCopyTo(buffer)) + { + return src.length; + } + else + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } +} + +String cpp::encoding::Ascii::decode(View view) +{ + if (view.isEmpty()) + { + return hx::Throw(HX_CSTRING("View is empty")); + } + + auto bytes = int64_t{ 0 }; + auto i = int64_t{ 0 }; + auto chars = view.reinterpret(); + + while (i < chars.length && 0 != chars.ptr[i]) + { + bytes += sizeof(char); + i++; + } + + if (0 == bytes) + { + return String::emptyString; + } + + auto backing = hx::NewGCPrivate(0, bytes + sizeof(char)); + + std::memcpy(backing, view.ptr.ptr, bytes); + + return String(static_cast(backing), bytes / sizeof(char)); +} diff --git a/src/cpp/encoding/UTF16.cpp b/src/cpp/encoding/UTF16.cpp new file mode 100644 index 000000000..743629301 --- /dev/null +++ b/src/cpp/encoding/UTF16.cpp @@ -0,0 +1,201 @@ +#include +#include + +using namespace cpp::marshal; + +namespace +{ + bool isSurrogate(char32_t codepoint) + { + return codepoint >= 0xd800 && codepoint < 0xe000; + } +} + +bool cpp::encoding::Utf16::isEncoded(const String& string) +{ + if (hx::IsNull(string)) + { + hx::NullReference("String", false); + } + + return string.isUTF16Encoded(); +} + +int32_t cpp::encoding::Utf16::getByteCount(const char32_t& codepoint) +{ + if (codepoint >= 0x10000) + { + if (codepoint < 0x110000) + { + return 4; + } + } + + return 2; +} + +int64_t cpp::encoding::Utf16::getByteCount(const String& string) +{ + if (hx::IsNull(string)) + { + hx::NullReference("String", false); + } + + if (string.isUTF16Encoded()) + { + return string.length * sizeof(char16_t); + } + else + { + auto bytes = int64_t{ 0 }; + for (auto i = 0; i < string.length; i++) + { + bytes += getByteCount(static_cast(string.raw_ptr()[i])); + } + + return bytes; + } +} + +int64_t cpp::encoding::Utf16::encode(const String& string, cpp::marshal::View buffer) +{ + if (hx::IsNull(string)) + { + hx::NullReference("String", false); + } + + if (0 == string.length) + { + return 0; + } + + if (buffer.isEmpty()) + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + + if (string.isUTF16Encoded()) + { + auto src = cpp::marshal::View(reinterpret_cast(const_cast(string.raw_wptr())), string.length * sizeof(char16_t)); + + if (src.tryCopyTo(buffer)) + { + return src.length; + } + else + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + } + else + { + auto bytes = int64_t{ 0 }; + for (auto i = 0; i < string.length; i++) + { + bytes += getByteCount(static_cast(string.raw_ptr()[i])); + } + + if (bytes > buffer.length) + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + + for (auto i = 0; i < string.length; i++) + { + buffer = buffer.slice(encode(static_cast(string.raw_ptr()[i]), buffer)); + } + + return bytes; + } +} + +int64_t cpp::encoding::Utf16::encode(const char32_t& codepoint, cpp::marshal::View buffer) +{ + if (codepoint >= 0x10000) + { + auto over = codepoint - 0x10000; + if (over >= 0x10000) + { + Marshal::writeUInt16(buffer, 0xFFFD); + + return 2; + } + else + { + auto staging = std::array(); + staging[0] = (over >> 10) + 0xD800; + staging[1] = (over & 0x3FF) + 0xDC00; + + Marshal::writeUInt32(buffer, *reinterpret_cast(staging.data())); + + return 4; + } + } + else if (isSurrogate(codepoint)) + { + Marshal::writeUInt16(buffer, 0xFFFD); + + return 2; + } + else + { + Marshal::writeUInt16(buffer, static_cast(codepoint)); + + return 2; + } +} + +String cpp::encoding::Utf16::decode(cpp::marshal::View buffer) +{ + if (buffer.isEmpty()) + { + return hx::Throw(HX_CSTRING("View empty")); + } + + auto bytes = int64_t{ 0 }; + auto codepoint = char32_t{ 0 }; + auto i = int64_t{ 0 }; + + while (i < buffer.length) + { + i += decode(buffer.slice(i), codepoint); + bytes += getByteCount(codepoint); + } + + auto backing = static_cast(hx::NewGCPrivate(0, bytes + sizeof(char16_t))); + auto output = View(backing, bytes); + + while (false == buffer.isEmpty()) + { + buffer = buffer.slice(decode(buffer, codepoint)); + output = output.slice(encode(codepoint, output)); + } + + reinterpret_cast(backing)[-1] |= HX_GC_STRING_CHAR16_T; + + return String(reinterpret_cast(backing), bytes / sizeof(char16_t)); +} + +int64_t cpp::encoding::Utf16::decode(cpp::marshal::View buffer, char32_t& codepoint) +{ + auto first = static_cast(Marshal::readUInt16(buffer)); + + if (0xD800 <= first && first < 0xDc00) + { + auto second = static_cast(Marshal::readUInt16(buffer.slice(2))); + if (0xDC00 <= second && second < 0xE000) + { + codepoint = ((((first - 0xD800) << 10) | (second - 0xDC00)) + 0x10000); + + return 4; + } + + return hx::Throw(HX_CSTRING("Invalid UTF16")); + } + else + { + codepoint = first; + + return 2; + } +} diff --git a/src/cpp/encoding/UTF8.cpp b/src/cpp/encoding/UTF8.cpp new file mode 100644 index 000000000..dc13c5c27 --- /dev/null +++ b/src/cpp/encoding/UTF8.cpp @@ -0,0 +1,219 @@ +#include + +using namespace cpp::marshal; + +int64_t cpp::encoding::Utf8::getByteCount(const char32_t& codepoint) +{ + if (codepoint <= 0x7F) + { + return 1; + } + else if (codepoint <= 0x7FF) + { + return 2; + } + else if (codepoint <= 0xFFFF) + { + return 3; + } + else + { + return 4; + } +} + +int64_t cpp::encoding::Utf8::getByteCount(const String& string) +{ + if (hx::IsNull(string)) + { + hx::NullReference("String", false); + } + + if (string.isAsciiEncoded()) + { + return string.length; + } + +#if defined(HX_SMART_STRINGS) + auto source = View(string.raw_wptr(), string.length).reinterpret(); + auto length = source.length; + auto codepoint = char32_t{ 0 }; + auto bytes = int64_t{ 0 }; + + while (false == source.isEmpty()) + { + source = source.slice(Utf16::decode(source, codepoint)); + bytes += getByteCount(codepoint); + } + + return bytes; +#else + return hx::Throw(HX_CSTRING("Unexpected encoding error")); +#endif +} + +int64_t cpp::encoding::Utf8::encode(const String& string, cpp::marshal::View buffer) +{ + if (hx::IsNull(string)) + { + hx::NullReference("String", false); + } + + if (0 == string.length) + { + return 0; + } + + if (buffer.isEmpty()) + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + + if (string.isAsciiEncoded()) + { + auto src = cpp::marshal::View(reinterpret_cast(const_cast(string.raw_ptr())), string.length * sizeof(char)); + + if (src.tryCopyTo(buffer)) + { + return src.length; + } + else + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + } + +#if defined(HX_SMART_STRINGS) + if (getByteCount(string) > buffer.length) + { + hx::Throw(HX_CSTRING("Buffer too small")); + } + + auto initialPtr = buffer.ptr.ptr; + auto source = View(string.raw_wptr(), string.length).reinterpret(); + auto codepoint = char32_t{ 0 }; + + while (false == source.isEmpty()) + { + source = source.slice(Utf16::decode(source, codepoint)); + buffer = buffer.slice(encode(codepoint, buffer)); + } + + return buffer.ptr.ptr - initialPtr; +#else + return hx::Throw(HX_CSTRING("Unexpected encoding error")); +#endif +} + +int64_t cpp::encoding::Utf8::encode(const char32_t& codepoint, cpp::marshal::View buffer) +{ + if (getByteCount(codepoint) > buffer.length) + { + hx::Throw(HX_CSTRING("Buffer too small")); + } + + if (codepoint <= 0x7F) + { + buffer.ptr[0] = codepoint; + + return 1; + } + else if (codepoint <= 0x7FF) + { + buffer.ptr[0] = (0xC0 | (codepoint >> 6)); + buffer.ptr[1] = (0x80 | (codepoint & 63)); + + return 2; + } + else if (codepoint <= 0xFFFF) + { + buffer.ptr[0] = (0xE0 | (codepoint >> 12)); + buffer.ptr[1] = (0x80 | ((codepoint >> 6) & 63)); + buffer.ptr[2] = (0x80 | (codepoint & 63)); + + return 3; + } + else + { + buffer.ptr[0] = (0xF0 | (codepoint >> 18)); + buffer.ptr[1] = (0x80 | ((codepoint >> 12) & 63)); + buffer.ptr[2] = (0x80 | ((codepoint >> 6) & 63)); + buffer.ptr[3] = (0x80 | (codepoint & 63)); + + return 4; + } +} + +String cpp::encoding::Utf8::decode(cpp::marshal::View buffer) +{ + if (buffer.isEmpty()) + { + return hx::Throw(HX_CSTRING("View empty")); + } + + auto bytes = int64_t{ 0 }; + auto codepoint = char32_t{ 0 }; + auto i = int64_t{ 0 }; + + while (i < buffer.length) + { + i += decode(buffer.slice(i), codepoint); + bytes += Utf16::getByteCount(codepoint); + } + + auto backing = static_cast(hx::NewGCPrivate(0, bytes + sizeof(char16_t))); + auto output = View(backing, bytes); + + while (false == buffer.isEmpty()) + { + buffer = buffer.slice(decode(buffer, codepoint)); + output = output.slice(Utf16::encode(codepoint, output)); + } + + reinterpret_cast(backing)[-1] |= HX_GC_STRING_CHAR16_T; + + return String(reinterpret_cast(backing), bytes / sizeof(char16_t)); +} + +int64_t cpp::encoding::Utf8::decode(cpp::marshal::View buffer, char32_t& codepoint) +{ + if (0 == buffer.length) + { + return hx::Throw(HX_CSTRING("Empty view")); + } + + auto b0 = buffer[0]; + + if ((b0 & 0x80) == 0) + { + codepoint = b0; + + return 1; + } + else if ((b0 & 0xE0) == 0xC0) + { + codepoint = (static_cast(b0 & 0x1F) << 6) | static_cast(buffer[1] & 0x3F); + + return 2; + } + else if ((b0 & 0xF0) == 0xE0) + { + codepoint = (static_cast(b0 & 0x0F) << 12) | (static_cast(buffer[1] & 0x3F) << 6) | static_cast(buffer[2] & 0x3F); + + return 3; + } + else if ((b0 & 0xF8) == 0xF0) + { + codepoint = + (static_cast(b0 & 0x07) << 18) | + (static_cast(buffer[1] & 0x3F) << 12) | + (static_cast(buffer[2] & 0x3F) << 6) | + static_cast(buffer[3] & 0x3F); + + return 4; + } + else + { + return hx::Throw(HX_CSTRING("Failed to read codepoint")); + } +} \ No newline at end of file diff --git a/test/native/Native.hx b/test/native/Native.hx index 0c8cb9506..d82f41a19 100644 --- a/test/native/Native.hx +++ b/test/native/Native.hx @@ -45,7 +45,11 @@ class Native new tests.marshalling.view.TestView(), new tests.marshalling.view.TestMarshal(), - new tests.marshalling.view.TestViewExtensions() + new tests.marshalling.view.TestViewExtensions(), + + new tests.encoding.TestAscii(), + new tests.encoding.TestUtf8(), + new tests.encoding.TestUtf16(), #end ]); } diff --git a/test/native/tests/encoding/TestAscii.hx b/test/native/tests/encoding/TestAscii.hx new file mode 100644 index 000000000..08d6b9df4 --- /dev/null +++ b/test/native/tests/encoding/TestAscii.hx @@ -0,0 +1,86 @@ +package tests.encoding; + +import haxe.io.Bytes; +import cpp.encoding.Ascii; +import utest.Assert; +import utest.Test; + +using cpp.marshal.ViewExtensions; + +class TestAscii extends Test +{ + function test_isEncoded_null() { + Assert.raises(() -> Ascii.isEncoded(null)); + } + + function test_isEncoded_ascii() { + Assert.isTrue(Ascii.isEncoded("test")); + } + + function test_isEncoded_utf16() { + Assert.isFalse(Ascii.isEncoded("πŸ˜‚")); + } + + function test_encode_null() { + final buffer = Bytes.alloc(4); + + Assert.raises(() -> Ascii.encode(null, buffer.asView())); + } + + function test_encode_small_buffer() { + final buffer = Bytes.alloc(2); + + Assert.raises(() -> Ascii.encode("test", buffer.asView())); + } + + function test_encode_utf16() { + final buffer = Bytes.alloc(1024); + + Assert.raises(() -> Ascii.encode("πŸ˜‚", buffer.asView())); + } + + function test_encode() { + final buffer = Bytes.alloc(1024); + + Assert.equals(4i64, Ascii.encode("test", buffer.asView())); + Assert.equals('t'.code, buffer.get(0)); + Assert.equals('e'.code, buffer.get(1)); + Assert.equals('s'.code, buffer.get(2)); + Assert.equals('t'.code, buffer.get(3)); + } + + function test_decode_empty() { + Assert.raises(() -> Ascii.decode(ViewExtensions.empty())); + } + + function test_decode() { + final buffer = Bytes.alloc(4); + buffer.set(0, 't'.code); + buffer.set(1, 'e'.code); + buffer.set(2, 's'.code); + buffer.set(3, 't'.code); + + Assert.equals('test', Ascii.decode(buffer.asView())); + } + + function test_decode_null_termination() { + final buffer = Bytes.alloc(9); + buffer.set(0, 't'.code); + buffer.set(1, 'e'.code); + buffer.set(2, 's'.code); + buffer.set(3, 't'.code); + buffer.set(4, 0); + buffer.set(5, 't'.code); + buffer.set(6, 'e'.code); + buffer.set(7, 's'.code); + buffer.set(8, 't'.code); + + Assert.equals('test', Ascii.decode(buffer.asView())); + } + + function test_decode_no_string() { + final buffer = Bytes.alloc(1); + + Assert.equals('', Ascii.decode(buffer.asView())); + } +} \ No newline at end of file diff --git a/test/native/tests/encoding/TestUtf16.hx b/test/native/tests/encoding/TestUtf16.hx new file mode 100644 index 000000000..c0e27589a --- /dev/null +++ b/test/native/tests/encoding/TestUtf16.hx @@ -0,0 +1,180 @@ +package tests.encoding; + +import haxe.io.Bytes; +import cpp.encoding.Utf16; +import utest.Assert; +import utest.Test; + +using cpp.marshal.ViewExtensions; + +class TestUtf16 extends Test { + function test_isEncoded_null() { + Assert.raises(() -> Utf16.isEncoded(null)); + } + + function test_isEncoded_ascii() { + Assert.isFalse(Utf16.isEncoded("test")); + } + + function test_isEncoded_utf16() { + Assert.isTrue(Utf16.isEncoded("πŸ˜‚")); + } + + public function test_getByteCount_codepoint() { + Assert.equals(2i64, Utf16.getByteCount('a'.code)); + Assert.equals(2i64, Utf16.getByteCount('Ζ…'.code)); + Assert.equals(2i64, Utf16.getByteCount('バ'.code)); + Assert.equals(4i64, Utf16.getByteCount('𝄳'.code)); + Assert.equals(4i64, Utf16.getByteCount('πŸ˜‚'.code)); + } + + public function test_getByteCount_string_null() { + Assert.raises(() -> Utf16.getByteCount((null:String))); + } + + public function test_getByteCount_string_empty() { + Assert.equals(0i64, Utf16.getByteCount('')); + } + + public function test_getByteCount_string_ascii() { + Assert.equals(26i64, Utf16.getByteCount('Hello, World!')); + } + + public function test_getByteCount_string_utf16() { + Assert.equals(26i64, Utf16.getByteCount('HelloπŸ˜‚World!')); + } + + public function test_encode_codepoint() { + final buffer = Bytes.alloc(4); + + Assert.equals(2i64, Utf16.encode('a'.code, buffer.asView())); + Assert.equals(0x61, buffer.get(0)); + Assert.equals(0x00, buffer.get(1)); + buffer.asView().clear(); + + Assert.equals(2i64, Utf16.encode('Ζ…'.code, buffer.asView())); + Assert.equals(0x85, buffer.get(0)); + Assert.equals(0x01, buffer.get(1)); + buffer.asView().clear(); + + Assert.equals(2i64, Utf16.encode('バ'.code, buffer.asView())); + Assert.equals(0xD0, buffer.get(0)); + Assert.equals(0x30, buffer.get(1)); + buffer.asView().clear(); + + Assert.equals(4i64, Utf16.encode('𝄳'.code, buffer.asView())); + Assert.equals(0x34, buffer.get(0)); + Assert.equals(0xD8, buffer.get(1)); + Assert.equals(0x33, buffer.get(2)); + Assert.equals(0xDD, buffer.get(3)); + buffer.asView().clear(); + } + + public function test_encode_codepoint_empty_view() { + Assert.raises(() -> Utf16.encode('a'.code, ViewExtensions.empty())); + } + + public function test_encode_codepoint_no_partial_writes() { + final buffer = Bytes.alloc(2); + + Assert.raises(() -> Utf16.encode('𝄳'.code, buffer.asView())); + Assert.equals(0, buffer.get(0)); + Assert.equals(0, buffer.get(1)); + } + + public function test_encode_string_null() { + final buffer = Bytes.alloc(8); + + Assert.raises(() -> Utf16.encode((null:String), buffer.asView())); + } + + public function test_encode_string_empty_view() { + Assert.raises(() -> Utf16.encode('test', ViewExtensions.empty())); + } + + public function test_encode_string_empty_string() { + final buffer = Bytes.alloc(8); + + Assert.equals(0i64, Utf16.encode('', buffer.asView())); + } + + public function test_encode_string_small_buffer() { + final buffer = Bytes.alloc(2); + + Assert.raises(() -> Utf16.encode('test', buffer.asView())); + Assert.equals(0, buffer.get(0)); + Assert.equals(0, buffer.get(1)); + } + + public function test_encode_string_ascii() { + final buffer = Bytes.alloc(8); + + Assert.equals(8i64, Utf16.encode('test', buffer.asView())); + Assert.equals('t'.code, buffer.get(0)); + Assert.equals(0, buffer.get(1)); + Assert.equals('e'.code, buffer.get(2)); + Assert.equals(0, buffer.get(3)); + Assert.equals('s'.code, buffer.get(4)); + Assert.equals(0, buffer.get(5)); + Assert.equals('t'.code, buffer.get(6)); + Assert.equals(0, buffer.get(7)); + } + + public function test_encode_string_utf16() { + final buffer = Bytes.alloc(16); + + Assert.equals(12i64, Utf16.encode('teπŸ˜‚st', buffer.asView())); + Assert.equals('t'.code, buffer.get(0)); + Assert.equals(0, buffer.get(1)); + Assert.equals('e'.code, buffer.get(2)); + Assert.equals(0, buffer.get(3)); + + Assert.equals(0x3D, buffer.get(4)); + Assert.equals(0xD8, buffer.get(5)); + Assert.equals(0x02, buffer.get(6)); + Assert.equals(0xDE, buffer.get(7)); + + Assert.equals('s'.code, buffer.get(8)); + Assert.equals(0, buffer.get(9)); + Assert.equals('t'.code, buffer.get(10)); + Assert.equals(0, buffer.get(11)); + } + + public function test_decode_codepoint() { + var codepoint : cpp.Char32 = 0; + + var bytes = Bytes.ofHex('6100'); + Assert.equals(2i64, Utf16.decode(bytes.asView(), codepoint)); + Assert.equals('a'.code, cast codepoint); + + var bytes = Bytes.ofHex('8501'); + Assert.equals(2i64, Utf16.decode(bytes.asView(), codepoint)); + Assert.equals('Ζ…'.code, cast codepoint); + + var bytes = Bytes.ofHex('D030'); + Assert.equals(2i64, Utf16.decode(bytes.asView(), codepoint)); + Assert.equals('バ'.code, cast codepoint); + + var bytes = Bytes.ofHex('34D833DD'); + Assert.equals(4i64, Utf16.decode(bytes.asView(), codepoint)); + Assert.equals('𝄳'.code, cast codepoint); + } + + public function test_decode_string() { + var bytes = Bytes.ofHex('6100'); + Assert.equals('a', Utf16.decode(bytes.asView())); + + var bytes = Bytes.ofHex('8501'); + Assert.equals('Ζ…', Utf16.decode(bytes.asView())); + + var bytes = Bytes.ofHex('D030'); + Assert.equals('バ', Utf16.decode(bytes.asView())); + + var bytes = Bytes.ofHex('34D833DD'); + Assert.equals('𝄳', Utf16.decode(bytes.asView())); + } + + public function test_decode_empty_view() { + Assert.raises(() -> Utf16.decode(ViewExtensions.empty())); + } +} \ No newline at end of file diff --git a/test/native/tests/encoding/TestUtf8.hx b/test/native/tests/encoding/TestUtf8.hx new file mode 100644 index 000000000..e03f312a9 --- /dev/null +++ b/test/native/tests/encoding/TestUtf8.hx @@ -0,0 +1,157 @@ +package tests.encoding; + +import haxe.io.Bytes; +import cpp.encoding.Utf8; +import utest.Assert; +import utest.Test; + +using cpp.marshal.ViewExtensions; + +class TestUtf8 extends Test { + public function test_getByteCount_codepoint() { + Assert.equals(1i64, Utf8.getByteCount('a'.code)); + Assert.equals(2i64, Utf8.getByteCount('Ζ…'.code)); + Assert.equals(3i64, Utf8.getByteCount('バ'.code)); + Assert.equals(4i64, Utf8.getByteCount('𝄳'.code)); + } + + public function test_getByteCount_string_null() { + Assert.raises(() -> Utf8.getByteCount((null:String))); + } + + public function test_getByteCount_string_empty() { + Assert.equals(0i64, Utf8.getByteCount('')); + } + + public function test_getByteCount_string_ascii() { + Assert.equals(13i64, Utf8.getByteCount('Hello, World!')); + } + + public function test_getByteCount_string_utf16() { + Assert.equals(15i64, Utf8.getByteCount('HelloπŸ˜‚World!')); + } + + public function test_encode_codepoint() { + final buffer = Bytes.alloc(4); + + Assert.equals(1i64, Utf8.encode('a'.code, buffer.asView())); + Assert.equals(0x61, buffer.get(0)); + buffer.asView().clear(); + + Assert.equals(2i64, Utf8.encode('Ζ…'.code, buffer.asView())); + Assert.equals(0xC6, buffer.get(0)); + Assert.equals(0x85, buffer.get(1)); + buffer.asView().clear(); + + Assert.equals(3i64, Utf8.encode('バ'.code, buffer.asView())); + Assert.equals(0xE3, buffer.get(0)); + Assert.equals(0x83, buffer.get(1)); + Assert.equals(0x90, buffer.get(2)); + buffer.asView().clear(); + + Assert.equals(4i64, Utf8.encode('𝄳'.code, buffer.asView())); + Assert.equals(0xF0, buffer.get(0)); + Assert.equals(0x9D, buffer.get(1)); + Assert.equals(0x84, buffer.get(2)); + Assert.equals(0xB3, buffer.get(3)); + buffer.asView().clear(); + } + + public function test_encode_codepoint_empty_view() { + Assert.raises(() -> Utf8.encode('a'.code, ViewExtensions.empty())); + } + + public function test_encode_codepoint_no_partial_writes() { + final buffer = Bytes.alloc(2); + + Assert.raises(() -> Utf8.encode('𝄳'.code, buffer.asView())); + Assert.equals(0, buffer.get(0)); + Assert.equals(0, buffer.get(1)); + } + + public function test_encode_string_null() { + final buffer = Bytes.alloc(8); + + Assert.raises(() -> Utf8.encode((null:String), buffer.asView())); + } + + public function test_encode_string_empty_view() { + Assert.raises(() -> Utf8.encode('test', ViewExtensions.empty())); + } + + public function test_encode_string_empty_string() { + final buffer = Bytes.alloc(8); + + Assert.equals(0i64, Utf8.encode('', buffer.asView())); + } + + public function test_encode_string_small_buffer() { + final buffer = Bytes.alloc(2); + + Assert.raises(() -> Utf8.encode('test', buffer.asView())); + Assert.equals(0, buffer.get(0)); + Assert.equals(0, buffer.get(1)); + } + + public function test_encode_string_ascii() { + final buffer = Bytes.alloc(4); + + Assert.equals(4i64, Utf8.encode('test', buffer.asView())); + Assert.equals('t'.code, buffer.get(0)); + Assert.equals('e'.code, buffer.get(1)); + Assert.equals('s'.code, buffer.get(2)); + Assert.equals('t'.code, buffer.get(3)); + } + + public function test_encode_string_utf16() { + final buffer = Bytes.alloc(8); + + Assert.equals(8i64, Utf8.encode('teπŸ˜‚st', buffer.asView())); + Assert.equals(0x74, buffer.get(0)); + Assert.equals(0x65, buffer.get(1)); + Assert.equals(0xF0, buffer.get(2)); + Assert.equals(0x9F, buffer.get(3)); + Assert.equals(0x98, buffer.get(4)); + Assert.equals(0x82, buffer.get(5)); + Assert.equals(0x73, buffer.get(6)); + Assert.equals(0x74, buffer.get(7)); + } + + public function test_decode_codepoint() { + var codepoint : cpp.Char32 = 0; + + var bytes = Bytes.ofHex('61'); + Assert.equals(1i64, Utf8.decode(bytes.asView(), codepoint)); + Assert.equals('a'.code, cast codepoint); + + var bytes = Bytes.ofHex('c685'); + Assert.equals(2i64, Utf8.decode(bytes.asView(), codepoint)); + Assert.equals('Ζ…'.code, cast codepoint); + + var bytes = Bytes.ofHex('e38390'); + Assert.equals(3i64, Utf8.decode(bytes.asView(), codepoint)); + Assert.equals('バ'.code, cast codepoint); + + var bytes = Bytes.ofHex('f09d84b3'); + Assert.equals(4i64, Utf8.decode(bytes.asView(), codepoint)); + Assert.equals('𝄳'.code, cast codepoint); + } + + public function test_decode_string() { + var bytes = Bytes.ofHex('61'); + Assert.equals('a', Utf8.decode(bytes.asView())); + + var bytes = Bytes.ofHex('c685'); + Assert.equals('Ζ…', Utf8.decode(bytes.asView())); + + var bytes = Bytes.ofHex('e38390'); + Assert.equals('バ', Utf8.decode(bytes.asView())); + + var bytes = Bytes.ofHex('f09d84b3'); + Assert.equals('𝄳', Utf8.decode(bytes.asView())); + } + + public function test_decode_empty_view() { + Assert.raises(() -> Utf8.decode(ViewExtensions.empty())); + } +} \ No newline at end of file diff --git a/toolchain/haxe-target.xml b/toolchain/haxe-target.xml index 8d7362e12..d7bc6cf42 100644 --- a/toolchain/haxe-target.xml +++ b/toolchain/haxe-target.xml @@ -68,6 +68,9 @@ + + + @@ -199,6 +202,10 @@ + + + + From 7bc2d4ee4bb0a8fbd15e346fcf4774565a4d2225 Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Mon, 22 Dec 2025 22:09:39 +0000 Subject: [PATCH 02/18] Remove old marshal stuff and return an empty string instead of throwing --- include/cpp/marshal/Definitions.inc | 13 +- include/cpp/marshal/Marshal.hpp | 66 ++--- src/cpp/encoding/ASCII.cpp | 2 +- src/cpp/encoding/UTF16.cpp | 18 +- src/cpp/encoding/UTF8.cpp | 27 +- test/native/cpp/encoding/Ascii.hx | 15 ++ test/native/cpp/encoding/Utf16.hx | 22 ++ test/native/cpp/encoding/Utf8.hx | 20 ++ .../tests/marshalling/view/TestMarshal.hx | 234 ++---------------- 9 files changed, 128 insertions(+), 289 deletions(-) create mode 100644 test/native/cpp/encoding/Ascii.hx create mode 100644 test/native/cpp/encoding/Utf16.hx create mode 100644 test/native/cpp/encoding/Utf8.hx diff --git a/include/cpp/marshal/Definitions.inc b/include/cpp/marshal/Definitions.inc index d4bdc0d00..575e044a5 100644 --- a/include/cpp/marshal/Definitions.inc +++ b/include/cpp/marshal/Definitions.inc @@ -224,17 +224,8 @@ namespace cpp static const bool isBigEndian = false; #endif - static View asView(const char* cstring); - static View asView(const char16_t* cstring); - - static View toCharView(const ::String& string); - static int toCharView(const ::String&, View buffer); - - static View toWideCharView(const ::String& string); - static int toWideCharView(const ::String& string, View buffer); - - static ::String toString(View buffer); - static ::String toString(View buffer); + static View asCharView(const ::String& string); + static View asWideCharView(const ::String& string); template static T read(View view); template static ::cpp::Pointer readPointer(View view); diff --git a/include/cpp/marshal/Marshal.hpp b/include/cpp/marshal/Marshal.hpp index 997cf5091..585c72ba8 100644 --- a/include/cpp/marshal/Marshal.hpp +++ b/include/cpp/marshal/Marshal.hpp @@ -18,72 +18,34 @@ namespace } } -inline cpp::marshal::View cpp::marshal::Marshal::asView(const char* cstring) +inline cpp::marshal::View cpp::marshal::Marshal::asCharView(const ::String& string) { - return cpp::marshal::View(const_cast(cstring), static_cast(std::char_traits::length(cstring))); -} - -inline cpp::marshal::View cpp::marshal::Marshal::asView(const char16_t* cstring) -{ - return cpp::marshal::View(const_cast(cstring), static_cast(std::char_traits::length(cstring))); -} - -inline cpp::marshal::View cpp::marshal::Marshal::toCharView(const ::String& string) -{ - auto length = 0; - auto ptr = string.utf8_str(nullptr, true, &length); - - return View(const_cast(ptr), length + 1); -} - -inline int cpp::marshal::Marshal::toCharView(const ::String& string, View buffer) -{ - auto length = 0; - - if (string.utf8_str(buffer, &length)) + if (null() == string) { - return length; + hx::NullReference("string", false); } - else - { - hx::Throw(HX_CSTRING("Not enough space in the view to write the string")); - return 0; + if (false == string.isAsciiEncoded()) + { + hx::Throw(HX_CSTRING("String is not ASCII encoded")); } -} - -inline cpp::marshal::View cpp::marshal::Marshal::toWideCharView(const ::String& string) -{ - auto length = 0; - auto ptr = string.wc_str(nullptr, &length); - return View(const_cast(ptr), length + 1); + return View(const_cast(string.raw_ptr()), string.length); } -inline int cpp::marshal::Marshal::toWideCharView(const ::String& string, View buffer) +inline cpp::marshal::View cpp::marshal::Marshal::asWideCharView(const ::String& string) { - auto length = 0; - - if (string.wc_str(buffer, &length)) + if (null() == string) { - return length; + hx::NullReference("string", false); } - else - { - hx::Throw(HX_CSTRING("Not enough space in the view to write the string")); - return 0; + if (false == string.isUTF16Encoded()) + { + hx::Throw(HX_CSTRING("String is not ASCII encoded")); } -} - -inline ::String cpp::marshal::Marshal::toString(View buffer) -{ - return ::String::create(buffer); -} -inline ::String cpp::marshal::Marshal::toString(View buffer) -{ - return ::String::create(buffer); + return View(const_cast(string.raw_wptr()), string.length); } template diff --git a/src/cpp/encoding/ASCII.cpp b/src/cpp/encoding/ASCII.cpp index 3115e19cd..7a0acd8bf 100644 --- a/src/cpp/encoding/ASCII.cpp +++ b/src/cpp/encoding/ASCII.cpp @@ -14,7 +14,7 @@ bool cpp::encoding::Ascii::isEncoded(const String& string) int64_t cpp::encoding::Ascii::encode(const String& string, View buffer) { - if (hx::IsNull(string)) + if (null() == string) { hx::NullReference("String", false); } diff --git a/src/cpp/encoding/UTF16.cpp b/src/cpp/encoding/UTF16.cpp index 743629301..9646e32d0 100644 --- a/src/cpp/encoding/UTF16.cpp +++ b/src/cpp/encoding/UTF16.cpp @@ -9,11 +9,21 @@ namespace { return codepoint >= 0xd800 && codepoint < 0xe000; } + + bool isLowSurrogate(char32_t codepoint) + { + return codepoint >= 0xdc00 && codepoint < 0xe000; + } + + bool isHighSurrogate(char32_t codepoint) + { + return codepoint >= 0xd800 && codepoint < 0xdc00; + } } bool cpp::encoding::Utf16::isEncoded(const String& string) { - if (hx::IsNull(string)) + if (null() == string) { hx::NullReference("String", false); } @@ -36,7 +46,7 @@ int32_t cpp::encoding::Utf16::getByteCount(const char32_t& codepoint) int64_t cpp::encoding::Utf16::getByteCount(const String& string) { - if (hx::IsNull(string)) + if (null() == string) { hx::NullReference("String", false); } @@ -59,7 +69,7 @@ int64_t cpp::encoding::Utf16::getByteCount(const String& string) int64_t cpp::encoding::Utf16::encode(const String& string, cpp::marshal::View buffer) { - if (hx::IsNull(string)) + if (null() == string) { hx::NullReference("String", false); } @@ -149,7 +159,7 @@ String cpp::encoding::Utf16::decode(cpp::marshal::View buffer) { if (buffer.isEmpty()) { - return hx::Throw(HX_CSTRING("View empty")); + return String::emptyString; } auto bytes = int64_t{ 0 }; diff --git a/src/cpp/encoding/UTF8.cpp b/src/cpp/encoding/UTF8.cpp index dc13c5c27..841a665e9 100644 --- a/src/cpp/encoding/UTF8.cpp +++ b/src/cpp/encoding/UTF8.cpp @@ -2,6 +2,22 @@ using namespace cpp::marshal; +namespace +{ + bool isAsciiBuffer(View& buffer) + { + for (auto i = int64_t{ 0 }; i < buffer.length; i++) + { + if (buffer.ptr[i] > 127) + { + return false; + } + } + + return true; + } +} + int64_t cpp::encoding::Utf8::getByteCount(const char32_t& codepoint) { if (codepoint <= 0x7F) @@ -24,7 +40,7 @@ int64_t cpp::encoding::Utf8::getByteCount(const char32_t& codepoint) int64_t cpp::encoding::Utf8::getByteCount(const String& string) { - if (hx::IsNull(string)) + if (null() == string) { hx::NullReference("String", false); } @@ -54,7 +70,7 @@ int64_t cpp::encoding::Utf8::getByteCount(const String& string) int64_t cpp::encoding::Utf8::encode(const String& string, cpp::marshal::View buffer) { - if (hx::IsNull(string)) + if (null() == string) { hx::NullReference("String", false); } @@ -148,7 +164,12 @@ String cpp::encoding::Utf8::decode(cpp::marshal::View buffer) { if (buffer.isEmpty()) { - return hx::Throw(HX_CSTRING("View empty")); + return String::emptyString; + } + + if (isAsciiBuffer(buffer)) + { + return Ascii::decode(buffer); } auto bytes = int64_t{ 0 }; diff --git a/test/native/cpp/encoding/Ascii.hx b/test/native/cpp/encoding/Ascii.hx new file mode 100644 index 000000000..a96eb54d9 --- /dev/null +++ b/test/native/cpp/encoding/Ascii.hx @@ -0,0 +1,15 @@ +package cpp.encoding; + +import cpp.UInt8; +import cpp.Int64; +import cpp.marshal.View; + +@:semantics(value) +@:cpp.PointerType({ namespace : [ "cpp", "encoding" ] }) +extern class Ascii { + static function isEncoded(string:String):Bool; + + static function encode(string:String, buffer:View):Int64; + + static function decode(buffer:View):String; +} diff --git a/test/native/cpp/encoding/Utf16.hx b/test/native/cpp/encoding/Utf16.hx new file mode 100644 index 000000000..161591202 --- /dev/null +++ b/test/native/cpp/encoding/Utf16.hx @@ -0,0 +1,22 @@ +package cpp.encoding; + +import cpp.UInt8; +import cpp.Int64; +import cpp.Char32; +import cpp.marshal.View; +import haxe.extern.AsVar; + +@:semantics(value) +@:cpp.PointerType({ namespace : [ "cpp", "encoding" ] }) +extern class Utf16 { + static function isEncoded(string:String):Bool; + + static overload function getByteCount(codepoint:Char32):Int64; + static overload function getByteCount(string:String):Int64; + + static overload function encode(string:String, buffer:View):Int64; + static overload function encode(codepoint:Char32, buffer:View):Int64; + + static overload function decode(buffer:View):String; + static overload function decode(buffer:View, codepoint:AsVar):Int64; +} \ No newline at end of file diff --git a/test/native/cpp/encoding/Utf8.hx b/test/native/cpp/encoding/Utf8.hx new file mode 100644 index 000000000..c1461c569 --- /dev/null +++ b/test/native/cpp/encoding/Utf8.hx @@ -0,0 +1,20 @@ +package cpp.encoding; + +import cpp.UInt8; +import cpp.Int64; +import cpp.Char32; +import cpp.marshal.View; +import haxe.extern.AsVar; + +@:semantics(value) +@:cpp.PointerType({ namespace : [ "cpp", "encoding" ] }) +extern class Utf8 { + static overload function getByteCount(codepoint:Char32):Int64; + static overload function getByteCount(string:String):Int64; + + static overload function encode(string:String, buffer:View):Int64; + static overload function encode(codepoint:Char32, buffer:View):Int64; + + static overload function decode(buffer:View):String; + static overload function decode(buffer:View, codepoint:AsVar):Int64; +} \ No newline at end of file diff --git a/test/native/tests/marshalling/view/TestMarshal.hx b/test/native/tests/marshalling/view/TestMarshal.hx index 60199b5be..ad236b094 100644 --- a/test/native/tests/marshalling/view/TestMarshal.hx +++ b/test/native/tests/marshalling/view/TestMarshal.hx @@ -115,233 +115,31 @@ class TestMarshal extends Test { Assert.isTrue(storage == value); } - function test_ascii_string_to_utf8() { - final source = "Hello, World!"; - final view = source.toCharView(); - - if (Assert.equals(source.length + 1, view.length)) { - Assert.equals(view[ 0], "H".code); - Assert.equals(view[ 1], "e".code); - Assert.equals(view[ 2], "l".code); - Assert.equals(view[ 3], "l".code); - Assert.equals(view[ 4], "o".code); - Assert.equals(view[ 5], ",".code); - Assert.equals(view[ 6], " ".code); - Assert.equals(view[ 7], "W".code); - Assert.equals(view[ 8], "o".code); - Assert.equals(view[ 9], "r".code); - Assert.equals(view[10], "l".code); - Assert.equals(view[11], "d".code); - Assert.equals(view[12], "!".code); - Assert.equals(view[13], 0); - } - } - - function test_ascii_string_to_utf8_buffer() { - final source = "Hello, World!"; - final buffer = Bytes.ofHex("FFFFFFFFFFFFFFFFFFFFFFFFFFFF"); - final view = buffer.asView().reinterpret(); - final count = Marshal.toCharView(source, view); - - if (Assert.equals(source.length + 1, count)) { - Assert.equals(view[ 0], "H".code); - Assert.equals(view[ 1], "e".code); - Assert.equals(view[ 2], "l".code); - Assert.equals(view[ 3], "l".code); - Assert.equals(view[ 4], "o".code); - Assert.equals(view[ 5], ",".code); - Assert.equals(view[ 6], " ".code); - Assert.equals(view[ 7], "W".code); - Assert.equals(view[ 8], "o".code); - Assert.equals(view[ 9], "r".code); - Assert.equals(view[10], "l".code); - Assert.equals(view[11], "d".code); - Assert.equals(view[12], "!".code); - Assert.equals(view[13], 0); - } - } - - function test_emoji_string_to_utf8() { - final source = "πŸ˜‚"; - final view = source.toCharView(); - - if (Assert.equals(5, view.length)) { - Assert.equals((0xf0:Char), view[0]); - Assert.equals((0x9f:Char), view[1]); - Assert.equals((0x98:Char), view[2]); - Assert.equals((0x82:Char), view[3]); - Assert.equals(0, view[4]); - } + function test_asCharView_null() { + Assert.raises(() -> Marshal.asCharView(null)); } - function test_emoji_string_to_utf8_buffer() { - final source = "πŸ˜‚"; - final buffer = Bytes.ofHex("FFFFFFFFFF"); - final view = buffer.asView().reinterpret(); - final count = Marshal.toCharView(source, view); - - if (Assert.equals(5, count)) { - Assert.equals((0xf0:Char), view[0]); - Assert.equals((0x9f:Char), view[1]); - Assert.equals((0x98:Char), view[2]); - Assert.equals((0x82:Char), view[3]); - Assert.equals(0, view[4]); - } + function test_asWideCharView_null() { + Assert.raises(() -> Marshal.asWideCharView(null)); } - function test_ascii_string_to_utf16() { - final source = "Hello, World!"; - final view = source.toWideCharView(); - - if (Assert.equals(source.length + 1, view.length)) { - Assert.equals(view[ 0], "H".code); - Assert.equals(view[ 1], "e".code); - Assert.equals(view[ 2], "l".code); - Assert.equals(view[ 3], "l".code); - Assert.equals(view[ 4], "o".code); - Assert.equals(view[ 5], ",".code); - Assert.equals(view[ 6], " ".code); - Assert.equals(view[ 7], "W".code); - Assert.equals(view[ 8], "o".code); - Assert.equals(view[ 9], "r".code); - Assert.equals(view[10], "l".code); - Assert.equals(view[11], "d".code); - Assert.equals(view[12], "!".code); - Assert.equals(view[13], 0); - } + function test_asCharView_wrong_encoding() { + Assert.raises(() -> Marshal.asCharView("πŸ˜‚")); } - function test_ascii_string_to_utf16_buffer() { - final source = "Hello, World!"; - final buffer = Bytes.ofHex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"); - final view = buffer.asView().reinterpret(); - final count = Marshal.toWideCharView(source, view); - - if (Assert.equals(count, view.length)) { - Assert.equals(view[ 0], "H".code); - Assert.equals(view[ 1], "e".code); - Assert.equals(view[ 2], "l".code); - Assert.equals(view[ 3], "l".code); - Assert.equals(view[ 4], "o".code); - Assert.equals(view[ 5], ",".code); - Assert.equals(view[ 6], " ".code); - Assert.equals(view[ 7], "W".code); - Assert.equals(view[ 8], "o".code); - Assert.equals(view[ 9], "r".code); - Assert.equals(view[10], "l".code); - Assert.equals(view[11], "d".code); - Assert.equals(view[12], "!".code); - Assert.equals(view[13], 0); - } - } - - function test_emoji_string_to_utf16() { - final source = "πŸ˜‚"; - final view = source.toWideCharView(); - - if (Assert.equals(3, view.length)) { - Assert.equals((0xD83D:Char16), view[0]); - Assert.equals((0xDE02:Char16), view[1]); - Assert.equals(0, view[2]); - } - } - - function test_emoji_string_to_utf16_buffer() { - final source = "πŸ˜‚"; - final buffer = Bytes.ofHex("FFFFFFFFFFFFFFFF"); - final view = buffer.asView().slice(0, 3 * 2).reinterpret(); - final count = Marshal.toWideCharView(source, view); - - if (Assert.equals(count, view.length)) { - Assert.equals((0xD83D:Char16), view[0]); - Assert.equals((0xDE02:Char16), view[1]); - Assert.equals(0, view[2]); - } - } - - function test_ascii_chars_to_string() { - final buffer = new Vector(5); - buffer[0] = 'H'.code; - buffer[1] = 'e'.code; - buffer[2] = 'l'.code; - buffer[3] = 'l'.code; - buffer[4] = 'o'.code; - final view = buffer.asView(); - final string = view.toString(); - - Assert.equals('Hello', string); + function test_asWideCharView_wrong_encoding() { + Assert.raises(() -> Marshal.asWideCharView("hello")); } - function test_ascii_wide_chars_to_string() { - final buffer = new Vector(5); - buffer[0] = 'H'.code; - buffer[1] = 'e'.code; - buffer[2] = 'l'.code; - buffer[3] = 'l'.code; - buffer[4] = 'o'.code; - final view = buffer.asView(); - final string = view.toString(); - - Assert.equals('Hello', string); - } - - function test_null_terminated_ascii_chars_to_string() { - final buffer = new Vector(5); - buffer[0] = 'H'.code; - buffer[1] = 'e'.code; - buffer[2] = 'l'.code; - buffer[3] = 'l'.code; - buffer[4] = 'o'.code; - buffer[5] = 0; - final view = buffer.asView(); - final string = view.toString(); - - Assert.equals('Hello', string); - } - - function test_null_terminated_ascii_wide_chars_to_string() { - final buffer = new Vector(5); - buffer[0] = 'H'.code; - buffer[1] = 'e'.code; - buffer[2] = 'l'.code; - buffer[3] = 'l'.code; - buffer[4] = 'o'.code; - buffer[5] = 0; - final view = buffer.asView(); - final string = view.toString(); - - Assert.equals('Hello', string); - } - - function test_utf8_bytes_to_string() { - final buffer = Bytes.ofHex("f09f9882"); - final view = (buffer.asView().reinterpret() : View); - final string = view.toString(); - - Assert.equals('πŸ˜‚', string); - } - - function test_null_terminated_utf8_bytes_to_string() { - final buffer = Bytes.ofHex("f09f98820000"); - final view = (buffer.asView().reinterpret() : View); - final string = view.toString(); - - Assert.equals('πŸ˜‚', string); - } - - function test_utf16_bytes_to_string() { - final buffer = Bytes.ofHex("3DD802De"); - final view = (buffer.asView().reinterpret() : View); - final string = view.toString(); - - Assert.equals('πŸ˜‚', string); + function test_asCharView() { + final view = "hello".asCharView(); + + Assert.equals(5, view.length); } - function test_null_terminated_utf16_bytes_to_string() { - final buffer = Bytes.ofHex("3DD802De00000000"); - final view = (buffer.asView().reinterpret() : View); - final string = view.toString(); - - Assert.equals('πŸ˜‚', string); + function test_asWideCharView() { + final view = "πŸ˜‚".asWideCharView(); + + Assert.equals(2, view.length); } } \ No newline at end of file From 6ea2cc27f0377252ec4d894257d3da03940933bc Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Mon, 22 Dec 2025 22:09:59 +0000 Subject: [PATCH 03/18] update tests --- test/native/tests/encoding/TestUtf16.hx | 2 +- test/native/tests/encoding/TestUtf8.hx | 2 +- .../tests/marshalling/view/TestMarshal.hx | 4 ++-- .../native/tests/marshalling/view/TestView.hx | 14 ++++++------- .../marshalling/view/TestViewExtensions.hx | 21 ++++++++++--------- 5 files changed, 22 insertions(+), 21 deletions(-) diff --git a/test/native/tests/encoding/TestUtf16.hx b/test/native/tests/encoding/TestUtf16.hx index c0e27589a..c970645b8 100644 --- a/test/native/tests/encoding/TestUtf16.hx +++ b/test/native/tests/encoding/TestUtf16.hx @@ -175,6 +175,6 @@ class TestUtf16 extends Test { } public function test_decode_empty_view() { - Assert.raises(() -> Utf16.decode(ViewExtensions.empty())); + Assert.equals("", Utf16.decode(ViewExtensions.empty())); } } \ No newline at end of file diff --git a/test/native/tests/encoding/TestUtf8.hx b/test/native/tests/encoding/TestUtf8.hx index e03f312a9..739633f3c 100644 --- a/test/native/tests/encoding/TestUtf8.hx +++ b/test/native/tests/encoding/TestUtf8.hx @@ -152,6 +152,6 @@ class TestUtf8 extends Test { } public function test_decode_empty_view() { - Assert.raises(() -> Utf8.decode(ViewExtensions.empty())); + Assert.equals("",Utf8.decode(ViewExtensions.empty())); } } \ No newline at end of file diff --git a/test/native/tests/marshalling/view/TestMarshal.hx b/test/native/tests/marshalling/view/TestMarshal.hx index ad236b094..8766560e9 100644 --- a/test/native/tests/marshalling/view/TestMarshal.hx +++ b/test/native/tests/marshalling/view/TestMarshal.hx @@ -134,12 +134,12 @@ class TestMarshal extends Test { function test_asCharView() { final view = "hello".asCharView(); - Assert.equals(5, view.length); + Assert.equals(5i64, view.length); } function test_asWideCharView() { final view = "πŸ˜‚".asWideCharView(); - Assert.equals(2, view.length); + Assert.equals(2i64, view.length); } } \ No newline at end of file diff --git a/test/native/tests/marshalling/view/TestView.hx b/test/native/tests/marshalling/view/TestView.hx index 8eb6c5555..7406d17f3 100644 --- a/test/native/tests/marshalling/view/TestView.hx +++ b/test/native/tests/marshalling/view/TestView.hx @@ -139,8 +139,8 @@ class TestView extends Test { final index = 3; final slice = view.slice(index); - if (Assert.equals(7, slice.length)) { - for (i in 0...slice.length) { + if (Assert.equals(7i64, slice.length)) { + for (i in 0...(cast slice.length : Int)) { Assert.equals(i + index + 1, slice[i]); } } @@ -161,8 +161,8 @@ class TestView extends Test { final length = 4; final slice = view.slice(index, length); - if (Assert.equals(length, slice.length)) { - for (i in 0...slice.length) { + if (Assert.equals(haxe.Int64.ofInt(length), slice.length)) { + for (i in 0...(cast slice.length : Int)) { Assert.equals(i + index + 1, slice[i]); } } @@ -243,7 +243,7 @@ class TestView extends Test { final view = buffer.asView(); final second : View = view.reinterpret(); - Assert.equals(1, second.length); + Assert.equals(1i64, second.length); } function test_reinterpret_to_larger_type_not_enough_length() { @@ -251,7 +251,7 @@ class TestView extends Test { final view = buffer.asView(); final second : View = view.reinterpret(); - Assert.equals(0, second.length); + Assert.equals(0i64, second.length); } function test_reinterpret_to_value_type() { @@ -259,7 +259,7 @@ class TestView extends Test { final view = buffer.asView(); final points = (view.reinterpret() : View); - Assert.equals(2, points.length); + Assert.equals(2i64, points.length); Assert.equals(0f64, points[0].x); Assert.equals(0f64, points[0].y); diff --git a/test/native/tests/marshalling/view/TestViewExtensions.hx b/test/native/tests/marshalling/view/TestViewExtensions.hx index 271b1e103..c9d2e2609 100644 --- a/test/native/tests/marshalling/view/TestViewExtensions.hx +++ b/test/native/tests/marshalling/view/TestViewExtensions.hx @@ -1,5 +1,6 @@ package tests.marshalling.view; +import haxe.Int64; import haxe.io.UInt8Array; import haxe.io.UInt16Array; import haxe.io.UInt32Array; @@ -49,7 +50,7 @@ class TestViewExtensions extends Test { final array = [ 100, 200, 300, 400 ]; final view = array.asView(); - if (Assert.equals(array.length, view.length)) { + if (Assert.equals(Int64.ofInt(array.length), view.length)) { for (i in 0...array.length) { Assert.equals(array[i], view[i]); } @@ -60,7 +61,7 @@ class TestViewExtensions extends Test { final vector = Vector.fromData([ 100, 200, 300, 400 ]); final view = vector.asView(); - if (Assert.equals(vector.length, view.length)) { + if (Assert.equals(Int64.ofInt(vector.length), view.length)) { for (i in 0...vector.length) { Assert.equals(vector[i], view[i]); } @@ -71,7 +72,7 @@ class TestViewExtensions extends Test { final bytes = Bytes.ofData([ 10, 20, 30, 40 ]); final view = bytes.asView(); - if (Assert.equals(bytes.length, view.length)) { + if (Assert.equals(Int64.ofInt(bytes.length), view.length)) { for (i in 0...bytes.length) { Assert.equals(bytes.get(i), view[i]); } @@ -83,7 +84,7 @@ class TestViewExtensions extends Test { final buffer = ArrayBufferView.fromBytes(Bytes.ofData([ for (i in 0...100) i ])).sub(index, 10); final view = buffer.asView(); - if (Assert.equals(buffer.byteLength, view.length)) { + if (Assert.equals(Int64.ofInt(buffer.byteLength), view.length)) { for (i in 0...buffer.byteLength) { Assert.equals(buffer.buffer.get(index + i), view[i]); } @@ -95,7 +96,7 @@ class TestViewExtensions extends Test { final buffer = Float32Array.fromArray([ for (i in 0...100) i ]).sub(index, 10); final view = buffer.asView(); - if (Assert.equals(buffer.length, view.length)) { + if (Assert.equals(Int64.ofInt(buffer.length), view.length)) { for (i in 0...buffer.length) { Assert.equals(buffer[i], view[i]); } @@ -107,7 +108,7 @@ class TestViewExtensions extends Test { final buffer = Float64Array.fromArray([ for (i in 0...100) i ]).sub(index, 10); final view = buffer.asView(); - if (Assert.equals(buffer.length, view.length)) { + if (Assert.equals(Int64.ofInt(buffer.length), view.length)) { for (i in 0...buffer.length) { Assert.equals(buffer[i], view[i]); } @@ -119,7 +120,7 @@ class TestViewExtensions extends Test { final buffer = Int32Array.fromArray([ for (i in 0...100) i ]).sub(index, 10); final view = buffer.asView(); - if (Assert.equals(buffer.length, view.length)) { + if (Assert.equals(Int64.ofInt(buffer.length), view.length)) { for (i in 0...buffer.length) { Assert.equals(buffer[i], view[i]); } @@ -131,7 +132,7 @@ class TestViewExtensions extends Test { final buffer = UInt32Array.fromArray([ for (i in 0...100) i ]).sub(index, 10); final view = buffer.asView(); - if (Assert.equals(buffer.length, view.length)) { + if (Assert.equals(Int64.ofInt(buffer.length), view.length)) { for (i in 0...buffer.length) { Assert.equals(buffer[i], view[i]); } @@ -143,7 +144,7 @@ class TestViewExtensions extends Test { final buffer = UInt16Array.fromArray([ for (i in 0...100) i ]).sub(index, 10); final view = buffer.asView(); - if (Assert.equals(buffer.length, view.length)) { + if (Assert.equals(Int64.ofInt(buffer.length), view.length)) { for (i in 0...buffer.length) { Assert.equals(buffer[i], view[i]); } @@ -155,7 +156,7 @@ class TestViewExtensions extends Test { final buffer = UInt8Array.fromArray([ for (i in 0...100) i ]).sub(index, 10); final view = buffer.asView(); - if (Assert.equals(buffer.length, view.length)) { + if (Assert.equals(Int64.ofInt(buffer.length), view.length)) { for (i in 0...buffer.length) { Assert.equals(buffer[i], view[i]); } From c39a54cac5676c6c5a19be05917538ebe0178209 Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Tue, 23 Dec 2025 10:19:29 +0000 Subject: [PATCH 04/18] Case change --- include/cpp/encoding/{ASCII.hpp => Ascii.hpp} | 0 include/cpp/encoding/{UTF16.hpp => Utf16.hpp} | 0 include/cpp/encoding/{UTF8.hpp => Utf8.hpp} | 0 include/hxcpp.h | 3 +++ src/cpp/encoding/{ASCII.cpp => Ascii.cpp} | 0 src/cpp/encoding/{UTF16.cpp => Utf16.cpp} | 0 src/cpp/encoding/{UTF8.cpp => Utf8.cpp} | 0 toolchain/haxe-target.xml | 12 ++++++------ 8 files changed, 9 insertions(+), 6 deletions(-) rename include/cpp/encoding/{ASCII.hpp => Ascii.hpp} (100%) rename include/cpp/encoding/{UTF16.hpp => Utf16.hpp} (100%) rename include/cpp/encoding/{UTF8.hpp => Utf8.hpp} (100%) rename src/cpp/encoding/{ASCII.cpp => Ascii.cpp} (100%) rename src/cpp/encoding/{UTF16.cpp => Utf16.cpp} (100%) rename src/cpp/encoding/{UTF8.cpp => Utf8.cpp} (100%) diff --git a/include/cpp/encoding/ASCII.hpp b/include/cpp/encoding/Ascii.hpp similarity index 100% rename from include/cpp/encoding/ASCII.hpp rename to include/cpp/encoding/Ascii.hpp diff --git a/include/cpp/encoding/UTF16.hpp b/include/cpp/encoding/Utf16.hpp similarity index 100% rename from include/cpp/encoding/UTF16.hpp rename to include/cpp/encoding/Utf16.hpp diff --git a/include/cpp/encoding/UTF8.hpp b/include/cpp/encoding/Utf8.hpp similarity index 100% rename from include/cpp/encoding/UTF8.hpp rename to include/cpp/encoding/Utf8.hpp diff --git a/include/hxcpp.h b/include/hxcpp.h index 68824a682..71618c1b3 100755 --- a/include/hxcpp.h +++ b/include/hxcpp.h @@ -358,6 +358,9 @@ typedef PropertyAccessMode PropertyAccess; #include #include #include +#include +#include +#include #include #include #include diff --git a/src/cpp/encoding/ASCII.cpp b/src/cpp/encoding/Ascii.cpp similarity index 100% rename from src/cpp/encoding/ASCII.cpp rename to src/cpp/encoding/Ascii.cpp diff --git a/src/cpp/encoding/UTF16.cpp b/src/cpp/encoding/Utf16.cpp similarity index 100% rename from src/cpp/encoding/UTF16.cpp rename to src/cpp/encoding/Utf16.cpp diff --git a/src/cpp/encoding/UTF8.cpp b/src/cpp/encoding/Utf8.cpp similarity index 100% rename from src/cpp/encoding/UTF8.cpp rename to src/cpp/encoding/Utf8.cpp diff --git a/toolchain/haxe-target.xml b/toolchain/haxe-target.xml index d7bc6cf42..09d933727 100644 --- a/toolchain/haxe-target.xml +++ b/toolchain/haxe-target.xml @@ -68,9 +68,9 @@ - - - + + + @@ -202,9 +202,9 @@ - - - + + + From bbc622bd3cb60b2ac8b7af559fd49e4538fc3cf7 Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Tue, 23 Dec 2025 10:19:41 +0000 Subject: [PATCH 05/18] Update utf16 codepoint encoder --- src/cpp/encoding/Utf16.cpp | 40 ++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/src/cpp/encoding/Utf16.cpp b/src/cpp/encoding/Utf16.cpp index 9646e32d0..c2596a9e6 100644 --- a/src/cpp/encoding/Utf16.cpp +++ b/src/cpp/encoding/Utf16.cpp @@ -121,38 +121,36 @@ int64_t cpp::encoding::Utf16::encode(const String& string, cpp::marshal::View buffer) { - if (codepoint >= 0x10000) + if (codepoint < 0xD800) { - auto over = codepoint - 0x10000; - if (over >= 0x10000) - { - Marshal::writeUInt16(buffer, 0xFFFD); - - return 2; - } - else - { - auto staging = std::array(); - staging[0] = (over >> 10) + 0xD800; - staging[1] = (over & 0x3FF) + 0xDC00; + Marshal::writeUInt16(buffer, static_cast(codepoint)); - Marshal::writeUInt32(buffer, *reinterpret_cast(staging.data())); + return 2; + } + else if (codepoint < 0xE000) + { + // D800 - DFFF is invalid - return 4; - } + return hx::Throw(HX_CSTRING("Invalid UTF16")); } - else if (isSurrogate(codepoint)) + else if (codepoint < 0x10000) { - Marshal::writeUInt16(buffer, 0xFFFD); + Marshal::writeUInt16(buffer, static_cast(codepoint)); return 2; } - else + else if (codepoint < 0x110000) { - Marshal::writeUInt16(buffer, static_cast(codepoint)); + auto staging = std::array(); + staging[0] = 0xD800 + (((codepoint - 0x10000) >> 10) & 0x3FF); + staging[1] = 0xDC00 + ((codepoint - 0x10000) & 0x3FF); - return 2; + Marshal::writeUInt32(buffer, *reinterpret_cast(staging.data())); + + return 4; } + + return 0; } String cpp::encoding::Utf16::decode(cpp::marshal::View buffer) From bfe921dcc32dc0ac23c4dc6b25f800401309f9a8 Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Tue, 23 Dec 2025 12:14:15 +0000 Subject: [PATCH 06/18] Add some extra functions --- include/cpp/encoding/Utf16.hpp | 3 +++ include/cpp/encoding/Utf8.hpp | 19 +++---------------- src/cpp/encoding/Utf16.cpp | 10 ++++++++++ src/cpp/encoding/Utf8.cpp | 10 ++++++++++ 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/include/cpp/encoding/Utf16.hpp b/include/cpp/encoding/Utf16.hpp index 9bb3bcc36..4a4750590 100644 --- a/include/cpp/encoding/Utf16.hpp +++ b/include/cpp/encoding/Utf16.hpp @@ -11,6 +11,9 @@ namespace cpp static int32_t getByteCount(const char32_t& codepoint); static int64_t getByteCount(const String& string); + static int64_t getCharCount(const char32_t& codepoint); + static int64_t getCharCount(const String& string); + static int64_t encode(const String& string, cpp::marshal::View buffer); static int64_t encode(const char32_t& codepoint, cpp::marshal::View buffer); diff --git a/include/cpp/encoding/Utf8.hpp b/include/cpp/encoding/Utf8.hpp index db06a9251..dbd2c4ba3 100644 --- a/include/cpp/encoding/Utf8.hpp +++ b/include/cpp/encoding/Utf8.hpp @@ -6,26 +6,13 @@ namespace cpp { struct Utf8 final { - /// - /// Returns the number of bytes required to store the codepoint in it's UTF8 form. - /// static int64_t getByteCount(const char32_t& codepoint); - - /// - /// Returns the number of bytes required to store the string in it's UTF8 form. - /// static int64_t getByteCount(const String& string); - /// - /// Writes the provided string in it's UTF8 form to the buffer. - /// - /// Number of byte written into the buffer - static int64_t encode(const String& string, cpp::marshal::View buffer); + static int64_t getCharCount(const char32_t& codepoint); + static int64_t getCharCount(const String& string); - /// - /// Writes the provided code point in it's UTF8 form to the buffer. - /// - /// Number of byte written into the buffer + static int64_t encode(const String& string, cpp::marshal::View buffer); static int64_t encode(const char32_t& codepoint, cpp::marshal::View buffer); static String decode(cpp::marshal::View buffer); diff --git a/src/cpp/encoding/Utf16.cpp b/src/cpp/encoding/Utf16.cpp index c2596a9e6..8280ce5a1 100644 --- a/src/cpp/encoding/Utf16.cpp +++ b/src/cpp/encoding/Utf16.cpp @@ -67,6 +67,16 @@ int64_t cpp::encoding::Utf16::getByteCount(const String& string) } } +int64_t cpp::encoding::Utf16::getCharCount(const char32_t& codepoint) +{ + return getByteCount(codepoint) / sizeof(char16_t); +} + +int64_t cpp::encoding::Utf16::getCharCount(const String& string) +{ + return getByteCount(string) / sizeof(char16_t); +} + int64_t cpp::encoding::Utf16::encode(const String& string, cpp::marshal::View buffer) { if (null() == string) diff --git a/src/cpp/encoding/Utf8.cpp b/src/cpp/encoding/Utf8.cpp index 841a665e9..42bca5d4e 100644 --- a/src/cpp/encoding/Utf8.cpp +++ b/src/cpp/encoding/Utf8.cpp @@ -68,6 +68,16 @@ int64_t cpp::encoding::Utf8::getByteCount(const String& string) #endif } +int64_t cpp::encoding::Utf8::getCharCount(const char32_t& codepoint) +{ + return getByteCount(codepoint) / sizeof(char); +} + +int64_t cpp::encoding::Utf8::getCharCount(const String& string) +{ + return getByteCount(string) / sizeof(char); +} + int64_t cpp::encoding::Utf8::encode(const String& string, cpp::marshal::View buffer) { if (null() == string) From f9fb825be534e9fd61f6a5b765a198aa6af03c8f Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Tue, 23 Dec 2025 13:23:19 +0000 Subject: [PATCH 07/18] copyTo function for view --- include/cpp/marshal/Definitions.inc | 1 + include/cpp/marshal/View.hpp | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/include/cpp/marshal/Definitions.inc b/include/cpp/marshal/Definitions.inc index 575e044a5..af0da5928 100644 --- a/include/cpp/marshal/Definitions.inc +++ b/include/cpp/marshal/Definitions.inc @@ -202,6 +202,7 @@ namespace cpp bool isEmpty(); View slice(int64_t index); View slice(int64_t index, int64_t length); + void copyTo(const View& destination); bool tryCopyTo(const View& destination); template View reinterpret(); int compare(const View& inRHS); diff --git a/include/cpp/marshal/View.hpp b/include/cpp/marshal/View.hpp index f62e6ad07..fddefd083 100644 --- a/include/cpp/marshal/View.hpp +++ b/include/cpp/marshal/View.hpp @@ -20,6 +20,15 @@ inline bool cpp::marshal::View::tryCopyTo(const View& destination) return true; } +template +inline void cpp::marshal::View::copyTo(const View& destination) +{ + if (tryCopyTo(destination) == false) + { + hx::Throw(HX_CSTRING("View OOB")); + } +} + template inline void cpp::marshal::View::clear() { From 12f7a7b8146dafe9935349274116fb0d681e2f7f Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Tue, 23 Dec 2025 13:23:52 +0000 Subject: [PATCH 08/18] single bounds check for utf8 encode --- src/cpp/encoding/Utf8.cpp | 54 +++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/src/cpp/encoding/Utf8.cpp b/src/cpp/encoding/Utf8.cpp index 42bca5d4e..3471ffc70 100644 --- a/src/cpp/encoding/Utf8.cpp +++ b/src/cpp/encoding/Utf8.cpp @@ -1,4 +1,5 @@ #include +#include using namespace cpp::marshal; @@ -133,40 +134,55 @@ int64_t cpp::encoding::Utf8::encode(const String& string, cpp::marshal::View buffer) { - if (getByteCount(codepoint) > buffer.length) - { - hx::Throw(HX_CSTRING("Buffer too small")); - } - if (codepoint <= 0x7F) { - buffer.ptr[0] = codepoint; + buffer[0] = codepoint; return 1; } else if (codepoint <= 0x7FF) { - buffer.ptr[0] = (0xC0 | (codepoint >> 6)); - buffer.ptr[1] = (0x80 | (codepoint & 63)); - - return 2; + auto data = std::array + { { + static_cast(0xC0 | (codepoint >> 6)), + static_cast(0x80 | (codepoint & 63)) + } }; + auto src = View(data.data(), data.size()); + + src.copyTo(buffer); + + return data.size(); } else if (codepoint <= 0xFFFF) { - buffer.ptr[0] = (0xE0 | (codepoint >> 12)); - buffer.ptr[1] = (0x80 | ((codepoint >> 6) & 63)); - buffer.ptr[2] = (0x80 | (codepoint & 63)); + auto data = std::array + { { + static_cast(0xE0 | (codepoint >> 12)), + static_cast(0x80 | ((codepoint >> 6) & 63)), + static_cast(0x80 | (codepoint & 63)) + } }; - return 3; + auto src = View(data.data(), data.size()); + + src.copyTo(buffer); + + return data.size(); } else { - buffer.ptr[0] = (0xF0 | (codepoint >> 18)); - buffer.ptr[1] = (0x80 | ((codepoint >> 12) & 63)); - buffer.ptr[2] = (0x80 | ((codepoint >> 6) & 63)); - buffer.ptr[3] = (0x80 | (codepoint & 63)); + auto data = std::array + { { + static_cast(0xF0 | (codepoint >> 18)), + static_cast(0x80 | ((codepoint >> 12) & 63)), + static_cast(0x80 | ((codepoint >> 6) & 63)), + static_cast(0x80 | (codepoint & 63)) + } }; - return 4; + auto src = View(data.data(), data.size()); + + src.copyTo(buffer); + + return data.size(); } } From f37b1b96bd6ceab714580ca22fff152c2db6344b Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Tue, 23 Dec 2025 14:04:52 +0000 Subject: [PATCH 09/18] marshal writes --- src/cpp/encoding/Utf16.cpp | 12 ++++++++---- src/cpp/encoding/Utf8.cpp | 27 ++++++++++++++++----------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/cpp/encoding/Utf16.cpp b/src/cpp/encoding/Utf16.cpp index 8280ce5a1..c61757219 100644 --- a/src/cpp/encoding/Utf16.cpp +++ b/src/cpp/encoding/Utf16.cpp @@ -151,11 +151,15 @@ int64_t cpp::encoding::Utf16::encode(const char32_t& codepoint, cpp::marshal::Vi } else if (codepoint < 0x110000) { - auto staging = std::array(); - staging[0] = 0xD800 + (((codepoint - 0x10000) >> 10) & 0x3FF); - staging[1] = 0xDC00 + ((codepoint - 0x10000) & 0x3FF); + auto staging = std::array(); + auto fst = View(staging.data(), 2); + auto snd = View(staging.data() + 2, 2); + auto all = View(staging.data(), staging.size()); - Marshal::writeUInt32(buffer, *reinterpret_cast(staging.data())); + Marshal::writeUInt16(fst, 0xD800 + (((codepoint - 0x10000) >> 10) & 0x3FF)); + Marshal::writeUInt16(snd, 0xDC00 + ((codepoint - 0x10000) & 0x3FF)); + + all.copyTo(buffer); return 4; } diff --git a/src/cpp/encoding/Utf8.cpp b/src/cpp/encoding/Utf8.cpp index 3471ffc70..7b768ba9c 100644 --- a/src/cpp/encoding/Utf8.cpp +++ b/src/cpp/encoding/Utf8.cpp @@ -224,12 +224,7 @@ String cpp::encoding::Utf8::decode(cpp::marshal::View buffer) int64_t cpp::encoding::Utf8::decode(cpp::marshal::View buffer, char32_t& codepoint) { - if (0 == buffer.length) - { - return hx::Throw(HX_CSTRING("Empty view")); - } - - auto b0 = buffer[0]; + auto& b0 = buffer[0]; if ((b0 & 0x80) == 0) { @@ -239,23 +234,33 @@ int64_t cpp::encoding::Utf8::decode(cpp::marshal::View buffer, char32_t } else if ((b0 & 0xE0) == 0xC0) { - codepoint = (static_cast(b0 & 0x1F) << 6) | static_cast(buffer[1] & 0x3F); + codepoint = (static_cast(b0 & 0x1F) << 6) | static_cast(buffer.slice(1)[0] & 0x3F); return 2; } else if ((b0 & 0xF0) == 0xE0) { - codepoint = (static_cast(b0 & 0x0F) << 12) | (static_cast(buffer[1] & 0x3F) << 6) | static_cast(buffer[2] & 0x3F); + auto staging = std::array(); + auto dst = View(staging.data(), staging.size()); + + buffer.slice(1, staging.size()).copyTo(dst); + + codepoint = (static_cast(b0 & 0x0F) << 12) | (static_cast(staging[0] & 0x3F) << 6) | static_cast(staging[1] & 0x3F); return 3; } else if ((b0 & 0xF8) == 0xF0) { + auto staging = std::array(); + auto dst = View(staging.data(), staging.size()); + + buffer.slice(1, staging.size()).copyTo(dst); + codepoint = (static_cast(b0 & 0x07) << 18) | - (static_cast(buffer[1] & 0x3F) << 12) | - (static_cast(buffer[2] & 0x3F) << 6) | - static_cast(buffer[3] & 0x3F); + (static_cast(staging[0] & 0x3F) << 12) | + (static_cast(staging[1] & 0x3F) << 6) | + static_cast(staging[2] & 0x3F); return 4; } From 6384c4563474160b4d8b8868f79004f98c644401 Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Tue, 23 Dec 2025 14:51:00 +0000 Subject: [PATCH 10/18] remove some conversion issues --- src/cpp/encoding/Utf16.cpp | 4 ++-- src/cpp/encoding/Utf8.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cpp/encoding/Utf16.cpp b/src/cpp/encoding/Utf16.cpp index c61757219..aaff4bff6 100644 --- a/src/cpp/encoding/Utf16.cpp +++ b/src/cpp/encoding/Utf16.cpp @@ -207,7 +207,7 @@ int64_t cpp::encoding::Utf16::decode(cpp::marshal::View buffer, char32_ auto second = static_cast(Marshal::readUInt16(buffer.slice(2))); if (0xDC00 <= second && second < 0xE000) { - codepoint = ((((first - 0xD800) << 10) | (second - 0xDC00)) + 0x10000); + codepoint = static_cast((((first - 0xD800) << 10) | (second - 0xDC00)) + 0x10000); return 4; } @@ -216,7 +216,7 @@ int64_t cpp::encoding::Utf16::decode(cpp::marshal::View buffer, char32_ } else { - codepoint = first; + codepoint = static_cast(first); return 2; } diff --git a/src/cpp/encoding/Utf8.cpp b/src/cpp/encoding/Utf8.cpp index 7b768ba9c..67331ad9b 100644 --- a/src/cpp/encoding/Utf8.cpp +++ b/src/cpp/encoding/Utf8.cpp @@ -136,7 +136,7 @@ int64_t cpp::encoding::Utf8::encode(const char32_t& codepoint, cpp::marshal::Vie { if (codepoint <= 0x7F) { - buffer[0] = codepoint; + buffer[0] = static_cast(codepoint); return 1; } @@ -224,7 +224,7 @@ String cpp::encoding::Utf8::decode(cpp::marshal::View buffer) int64_t cpp::encoding::Utf8::decode(cpp::marshal::View buffer, char32_t& codepoint) { - auto& b0 = buffer[0]; + auto b0 = static_cast(buffer[0]); if ((b0 & 0x80) == 0) { From be9ff94012f173a9e259f095507e50ab7cd4b73d Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Tue, 23 Dec 2025 15:13:33 +0000 Subject: [PATCH 11/18] Remove un-needed cast --- test/native/compile.hxml | 1 - test/native/tests/marshalling/view/TestView.hx | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/test/native/compile.hxml b/test/native/compile.hxml index a063c9266..825082805 100644 --- a/test/native/compile.hxml +++ b/test/native/compile.hxml @@ -1,4 +1,3 @@ -m Native -L utest --D HXCPP-DEBUGGER --cpp bin \ No newline at end of file diff --git a/test/native/tests/marshalling/view/TestView.hx b/test/native/tests/marshalling/view/TestView.hx index 7406d17f3..104a56e7a 100644 --- a/test/native/tests/marshalling/view/TestView.hx +++ b/test/native/tests/marshalling/view/TestView.hx @@ -140,7 +140,7 @@ class TestView extends Test { final slice = view.slice(index); if (Assert.equals(7i64, slice.length)) { - for (i in 0...(cast slice.length : Int)) { + for (i in 0...haxe.Int64.toInt(slice.length)) { Assert.equals(i + index + 1, slice[i]); } } @@ -162,7 +162,7 @@ class TestView extends Test { final slice = view.slice(index, length); if (Assert.equals(haxe.Int64.ofInt(length), slice.length)) { - for (i in 0...(cast slice.length : Int)) { + for (i in 0...haxe.Int64.toInt(slice.length)) { Assert.equals(i + index + 1, slice[i]); } } From 6ff0e28e903ea62a4d9eb58f513d2b1c24fdb11d Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Tue, 23 Dec 2025 15:50:47 +0000 Subject: [PATCH 12/18] Remove questionable implicit view to pointer conversions --- include/cpp/marshal/Definitions.inc | 4 ---- include/cpp/marshal/View.hpp | 18 ------------------ 2 files changed, 22 deletions(-) diff --git a/include/cpp/marshal/Definitions.inc b/include/cpp/marshal/Definitions.inc index af0da5928..fed1d91bd 100644 --- a/include/cpp/marshal/Definitions.inc +++ b/include/cpp/marshal/Definitions.inc @@ -211,10 +211,6 @@ namespace cpp bool operator!=(const View& inRHS) const; T& operator[] (int64_t index); - - operator void* (); - operator T* (); - operator Pointer(); }; struct Marshal final diff --git a/include/cpp/marshal/View.hpp b/include/cpp/marshal/View.hpp index fddefd083..1e54d80b3 100644 --- a/include/cpp/marshal/View.hpp +++ b/include/cpp/marshal/View.hpp @@ -130,22 +130,4 @@ inline T& cpp::marshal::View::operator[](int64_t index) } return ptr[index]; -} - -template -inline cpp::marshal::View::operator void* () -{ - return ptr.ptr; -} - -template -inline cpp::marshal::View::operator T* () -{ - return ptr.ptr; -} - -template -inline cpp::marshal::View::operator cpp::Pointer () -{ - return ptr; } \ No newline at end of file From 03a98d2109b80886be75471b61f88db2565753f2 Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Tue, 23 Dec 2025 22:22:30 +0000 Subject: [PATCH 13/18] int returns --- include/cpp/encoding/Utf16.hpp | 8 ++++---- include/cpp/encoding/Utf8.hpp | 8 ++++---- src/cpp/encoding/Utf16.cpp | 8 ++++---- src/cpp/encoding/Utf8.cpp | 8 ++++---- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/cpp/encoding/Utf16.hpp b/include/cpp/encoding/Utf16.hpp index 4a4750590..c28ca3722 100644 --- a/include/cpp/encoding/Utf16.hpp +++ b/include/cpp/encoding/Utf16.hpp @@ -8,17 +8,17 @@ namespace cpp { static bool isEncoded(const String& string); - static int32_t getByteCount(const char32_t& codepoint); + static int getByteCount(const char32_t& codepoint); static int64_t getByteCount(const String& string); - static int64_t getCharCount(const char32_t& codepoint); + static int getCharCount(const char32_t& codepoint); static int64_t getCharCount(const String& string); + static int encode(const char32_t& codepoint, cpp::marshal::View buffer); static int64_t encode(const String& string, cpp::marshal::View buffer); - static int64_t encode(const char32_t& codepoint, cpp::marshal::View buffer); + static int decode(cpp::marshal::View buffer, char32_t& out); static String decode(cpp::marshal::View buffer); - static int64_t decode(cpp::marshal::View buffer, char32_t& out); }; } } \ No newline at end of file diff --git a/include/cpp/encoding/Utf8.hpp b/include/cpp/encoding/Utf8.hpp index dbd2c4ba3..12d83e48e 100644 --- a/include/cpp/encoding/Utf8.hpp +++ b/include/cpp/encoding/Utf8.hpp @@ -6,17 +6,17 @@ namespace cpp { struct Utf8 final { - static int64_t getByteCount(const char32_t& codepoint); + static int getByteCount(const char32_t& codepoint); static int64_t getByteCount(const String& string); - static int64_t getCharCount(const char32_t& codepoint); + static int getCharCount(const char32_t& codepoint); static int64_t getCharCount(const String& string); + static int encode(const char32_t& codepoint, cpp::marshal::View buffer); static int64_t encode(const String& string, cpp::marshal::View buffer); - static int64_t encode(const char32_t& codepoint, cpp::marshal::View buffer); static String decode(cpp::marshal::View buffer); - static int64_t decode(cpp::marshal::View buffer, char32_t& out); + static int decode(cpp::marshal::View buffer, char32_t& out); }; } } \ No newline at end of file diff --git a/src/cpp/encoding/Utf16.cpp b/src/cpp/encoding/Utf16.cpp index aaff4bff6..fd7796366 100644 --- a/src/cpp/encoding/Utf16.cpp +++ b/src/cpp/encoding/Utf16.cpp @@ -31,7 +31,7 @@ bool cpp::encoding::Utf16::isEncoded(const String& string) return string.isUTF16Encoded(); } -int32_t cpp::encoding::Utf16::getByteCount(const char32_t& codepoint) +int cpp::encoding::Utf16::getByteCount(const char32_t& codepoint) { if (codepoint >= 0x10000) { @@ -67,7 +67,7 @@ int64_t cpp::encoding::Utf16::getByteCount(const String& string) } } -int64_t cpp::encoding::Utf16::getCharCount(const char32_t& codepoint) +int cpp::encoding::Utf16::getCharCount(const char32_t& codepoint) { return getByteCount(codepoint) / sizeof(char16_t); } @@ -129,7 +129,7 @@ int64_t cpp::encoding::Utf16::encode(const String& string, cpp::marshal::View buffer) +int cpp::encoding::Utf16::encode(const char32_t& codepoint, cpp::marshal::View buffer) { if (codepoint < 0xD800) { @@ -198,7 +198,7 @@ String cpp::encoding::Utf16::decode(cpp::marshal::View buffer) return String(reinterpret_cast(backing), bytes / sizeof(char16_t)); } -int64_t cpp::encoding::Utf16::decode(cpp::marshal::View buffer, char32_t& codepoint) +int cpp::encoding::Utf16::decode(cpp::marshal::View buffer, char32_t& codepoint) { auto first = static_cast(Marshal::readUInt16(buffer)); diff --git a/src/cpp/encoding/Utf8.cpp b/src/cpp/encoding/Utf8.cpp index 67331ad9b..3d500c36f 100644 --- a/src/cpp/encoding/Utf8.cpp +++ b/src/cpp/encoding/Utf8.cpp @@ -19,7 +19,7 @@ namespace } } -int64_t cpp::encoding::Utf8::getByteCount(const char32_t& codepoint) +int cpp::encoding::Utf8::getByteCount(const char32_t& codepoint) { if (codepoint <= 0x7F) { @@ -69,7 +69,7 @@ int64_t cpp::encoding::Utf8::getByteCount(const String& string) #endif } -int64_t cpp::encoding::Utf8::getCharCount(const char32_t& codepoint) +int cpp::encoding::Utf8::getCharCount(const char32_t& codepoint) { return getByteCount(codepoint) / sizeof(char); } @@ -132,7 +132,7 @@ int64_t cpp::encoding::Utf8::encode(const String& string, cpp::marshal::View buffer) +int cpp::encoding::Utf8::encode(const char32_t& codepoint, cpp::marshal::View buffer) { if (codepoint <= 0x7F) { @@ -222,7 +222,7 @@ String cpp::encoding::Utf8::decode(cpp::marshal::View buffer) return String(reinterpret_cast(backing), bytes / sizeof(char16_t)); } -int64_t cpp::encoding::Utf8::decode(cpp::marshal::View buffer, char32_t& codepoint) +int cpp::encoding::Utf8::decode(cpp::marshal::View buffer, char32_t& codepoint) { auto b0 = static_cast(buffer[0]); From c006fb469b43d2235f4e49d651872e50ec071840 Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Tue, 23 Dec 2025 23:22:09 +0000 Subject: [PATCH 14/18] switch to a dedicated codepoint function for utf16 --- include/cpp/encoding/Utf16.hpp | 2 +- src/cpp/encoding/Utf16.cpp | 38 +++++++++++++++------------------- src/cpp/encoding/Utf8.cpp | 14 +++++++------ 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/include/cpp/encoding/Utf16.hpp b/include/cpp/encoding/Utf16.hpp index c28ca3722..0e3d45c4b 100644 --- a/include/cpp/encoding/Utf16.hpp +++ b/include/cpp/encoding/Utf16.hpp @@ -17,7 +17,7 @@ namespace cpp static int encode(const char32_t& codepoint, cpp::marshal::View buffer); static int64_t encode(const String& string, cpp::marshal::View buffer); - static int decode(cpp::marshal::View buffer, char32_t& out); + static char32_t codepoint(cpp::marshal::View buffer); static String decode(cpp::marshal::View buffer); }; } diff --git a/src/cpp/encoding/Utf16.cpp b/src/cpp/encoding/Utf16.cpp index fd7796366..8682da9e7 100644 --- a/src/cpp/encoding/Utf16.cpp +++ b/src/cpp/encoding/Utf16.cpp @@ -174,31 +174,31 @@ String cpp::encoding::Utf16::decode(cpp::marshal::View buffer) return String::emptyString; } - auto bytes = int64_t{ 0 }; - auto codepoint = char32_t{ 0 }; - auto i = int64_t{ 0 }; - + auto chars = int64_t{ 0 }; + auto i = int64_t{ 0 }; while (i < buffer.length) { - i += decode(buffer.slice(i), codepoint); - bytes += getByteCount(codepoint); + auto p = codepoint(buffer.slice(i)); + + chars += getCharCount(p); + i += getByteCount(p); } - auto backing = static_cast(hx::NewGCPrivate(0, bytes + sizeof(char16_t))); - auto output = View(backing, bytes); + auto backing = View(::String::allocChar16Ptr(chars), chars); + auto output = backing.reinterpret(); while (false == buffer.isEmpty()) { - buffer = buffer.slice(decode(buffer, codepoint)); - output = output.slice(encode(codepoint, output)); - } + auto p = codepoint(buffer); - reinterpret_cast(backing)[-1] |= HX_GC_STRING_CHAR16_T; + buffer = buffer.slice(getByteCount(p)); + output = output.slice(encode(p, output)); + } - return String(reinterpret_cast(backing), bytes / sizeof(char16_t)); + return String(backing.ptr.ptr, chars); } -int cpp::encoding::Utf16::decode(cpp::marshal::View buffer, char32_t& codepoint) +char32_t cpp::encoding::Utf16::codepoint(cpp::marshal::View buffer) { auto first = static_cast(Marshal::readUInt16(buffer)); @@ -207,17 +207,13 @@ int cpp::encoding::Utf16::decode(cpp::marshal::View buffer, char32_t& c auto second = static_cast(Marshal::readUInt16(buffer.slice(2))); if (0xDC00 <= second && second < 0xE000) { - codepoint = static_cast((((first - 0xD800) << 10) | (second - 0xDC00)) + 0x10000); - - return 4; + return static_cast((((first - 0xD800) << 10) | (second - 0xDC00)) + 0x10000); } - return hx::Throw(HX_CSTRING("Invalid UTF16")); + return int{ hx::Throw(HX_CSTRING("Invalid UTF16")) }; } else { - codepoint = static_cast(first); - - return 2; + return static_cast(first); } } diff --git a/src/cpp/encoding/Utf8.cpp b/src/cpp/encoding/Utf8.cpp index 3d500c36f..3d1832e11 100644 --- a/src/cpp/encoding/Utf8.cpp +++ b/src/cpp/encoding/Utf8.cpp @@ -54,13 +54,14 @@ int64_t cpp::encoding::Utf8::getByteCount(const String& string) #if defined(HX_SMART_STRINGS) auto source = View(string.raw_wptr(), string.length).reinterpret(); auto length = source.length; - auto codepoint = char32_t{ 0 }; auto bytes = int64_t{ 0 }; while (false == source.isEmpty()) { - source = source.slice(Utf16::decode(source, codepoint)); - bytes += getByteCount(codepoint); + auto p = Utf16::codepoint(source); + + source = source.slice(Utf16::getByteCount(p)); + bytes += getByteCount(p); } return bytes; @@ -118,12 +119,13 @@ int64_t cpp::encoding::Utf8::encode(const String& string, cpp::marshal::View(string.raw_wptr(), string.length).reinterpret(); - auto codepoint = char32_t{ 0 }; while (false == source.isEmpty()) { - source = source.slice(Utf16::decode(source, codepoint)); - buffer = buffer.slice(encode(codepoint, buffer)); + auto p = Utf16::codepoint(source); + + source = source.slice(Utf16::getByteCount(p)); + buffer = buffer.slice(encode(p, buffer)); } return buffer.ptr.ptr - initialPtr; From 4b5e40a0216086be8e70c586990f98d7a0287229 Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Tue, 23 Dec 2025 23:31:00 +0000 Subject: [PATCH 15/18] move to a dedicated codepoint function for utf8 as well --- include/cpp/encoding/Utf8.hpp | 2 +- src/cpp/encoding/Utf8.cpp | 33 ++++++++++++++------------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/include/cpp/encoding/Utf8.hpp b/include/cpp/encoding/Utf8.hpp index 12d83e48e..809d90f68 100644 --- a/include/cpp/encoding/Utf8.hpp +++ b/include/cpp/encoding/Utf8.hpp @@ -15,8 +15,8 @@ namespace cpp static int encode(const char32_t& codepoint, cpp::marshal::View buffer); static int64_t encode(const String& string, cpp::marshal::View buffer); + static char32_t codepoint(cpp::marshal::View buffer); static String decode(cpp::marshal::View buffer); - static int decode(cpp::marshal::View buffer, char32_t& out); }; } } \ No newline at end of file diff --git a/src/cpp/encoding/Utf8.cpp b/src/cpp/encoding/Utf8.cpp index 3d1832e11..8553a3f7a 100644 --- a/src/cpp/encoding/Utf8.cpp +++ b/src/cpp/encoding/Utf8.cpp @@ -201,13 +201,14 @@ String cpp::encoding::Utf8::decode(cpp::marshal::View buffer) } auto bytes = int64_t{ 0 }; - auto codepoint = char32_t{ 0 }; auto i = int64_t{ 0 }; while (i < buffer.length) { - i += decode(buffer.slice(i), codepoint); - bytes += Utf16::getByteCount(codepoint); + auto p = codepoint(buffer.slice(i)); + + i += getByteCount(p); + bytes += Utf16::getByteCount(p); } auto backing = static_cast(hx::NewGCPrivate(0, bytes + sizeof(char16_t))); @@ -215,8 +216,10 @@ String cpp::encoding::Utf8::decode(cpp::marshal::View buffer) while (false == buffer.isEmpty()) { - buffer = buffer.slice(decode(buffer, codepoint)); - output = output.slice(Utf16::encode(codepoint, output)); + auto p = codepoint(buffer.slice(i)); + + buffer = buffer.slice(getByteCount(p)); + output = output.slice(Utf16::encode(p, output)); } reinterpret_cast(backing)[-1] |= HX_GC_STRING_CHAR16_T; @@ -224,21 +227,17 @@ String cpp::encoding::Utf8::decode(cpp::marshal::View buffer) return String(reinterpret_cast(backing), bytes / sizeof(char16_t)); } -int cpp::encoding::Utf8::decode(cpp::marshal::View buffer, char32_t& codepoint) +char32_t cpp::encoding::Utf8::codepoint(cpp::marshal::View buffer) { auto b0 = static_cast(buffer[0]); if ((b0 & 0x80) == 0) { - codepoint = b0; - - return 1; + return b0; } else if ((b0 & 0xE0) == 0xC0) { - codepoint = (static_cast(b0 & 0x1F) << 6) | static_cast(buffer.slice(1)[0] & 0x3F); - - return 2; + return (static_cast(b0 & 0x1F) << 6) | static_cast(buffer.slice(1)[0] & 0x3F); } else if ((b0 & 0xF0) == 0xE0) { @@ -247,9 +246,7 @@ int cpp::encoding::Utf8::decode(cpp::marshal::View buffer, char32_t& co buffer.slice(1, staging.size()).copyTo(dst); - codepoint = (static_cast(b0 & 0x0F) << 12) | (static_cast(staging[0] & 0x3F) << 6) | static_cast(staging[1] & 0x3F); - - return 3; + return (static_cast(b0 & 0x0F) << 12) | (static_cast(staging[0] & 0x3F) << 6) | static_cast(staging[1] & 0x3F); } else if ((b0 & 0xF8) == 0xF0) { @@ -258,16 +255,14 @@ int cpp::encoding::Utf8::decode(cpp::marshal::View buffer, char32_t& co buffer.slice(1, staging.size()).copyTo(dst); - codepoint = + return (static_cast(b0 & 0x07) << 18) | (static_cast(staging[0] & 0x3F) << 12) | (static_cast(staging[1] & 0x3F) << 6) | static_cast(staging[2] & 0x3F); - - return 4; } else { - return hx::Throw(HX_CSTRING("Failed to read codepoint")); + return int{ hx::Throw(HX_CSTRING("Failed to read codepoint")) }; } } \ No newline at end of file From 485e8b7c39ab005fc7c56066769105ac7ffcec8a Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Wed, 24 Dec 2025 10:20:59 +0000 Subject: [PATCH 16/18] Fix incorrect index reuse --- .gitignore | 406 ++++++++++++++++++++++++++++++++++++++ src/cpp/encoding/Utf8.cpp | 16 +- 2 files changed, 413 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index fcc9ac911..4ff620b27 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,409 @@ hxcpp.n *.ilk .vscode + +# Created by https://www.toptal.com/developers/gitignore/api/visualstudio +# Edit at https://www.toptal.com/developers/gitignore?templates=visualstudio + +### VisualStudio ### +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Ww][Ii][Nn]32/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# ASP.NET Scaffolding +ScaffoldingReadMe.txt + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.ilk +*.meta +*.obj +*.iobj +*.pch +*.pdb +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.log +*.tlog +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.sdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*.json +coverage*.xml +coverage*.info + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. +!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*~ +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat +node_modules/ + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) +*.vbw + +# Visual Studio 6 auto-generated project file (contains which files were open etc.) +*.vbp + +# Visual Studio 6 workspace and project file (working project files containing files to include in project) +*.dsw +*.dsp + +# Visual Studio 6 technical files + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +__pycache__/ +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# Visual Studio History (VSHistory) files +.vshistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +# Fody - auto-generated XML schema +FodyWeavers.xsd + +# VS Code files for those working on multiple tools +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +*.code-workspace + +# Local History for Visual Studio Code +.history/ + +# Windows Installer files from build outputs +*.cab +*.msi +*.msix +*.msm +*.msp + +# JetBrains Rider +*.sln.iml + +### VisualStudio Patch ### +# Additional files built by Visual Studio + +# End of https://www.toptal.com/developers/gitignore/api/visualstudio diff --git a/src/cpp/encoding/Utf8.cpp b/src/cpp/encoding/Utf8.cpp index 8553a3f7a..27096e591 100644 --- a/src/cpp/encoding/Utf8.cpp +++ b/src/cpp/encoding/Utf8.cpp @@ -200,31 +200,29 @@ String cpp::encoding::Utf8::decode(cpp::marshal::View buffer) return Ascii::decode(buffer); } - auto bytes = int64_t{ 0 }; - auto i = int64_t{ 0 }; + auto chars = int64_t{ 0 }; + auto i = int64_t{ 0 }; while (i < buffer.length) { auto p = codepoint(buffer.slice(i)); i += getByteCount(p); - bytes += Utf16::getByteCount(p); + chars += Utf16::getCharCount(p); } - auto backing = static_cast(hx::NewGCPrivate(0, bytes + sizeof(char16_t))); - auto output = View(backing, bytes); + auto backing = View(::String::allocChar16Ptr(chars), chars); + auto output = backing.reinterpret(); while (false == buffer.isEmpty()) { - auto p = codepoint(buffer.slice(i)); + auto p = codepoint(buffer); buffer = buffer.slice(getByteCount(p)); output = output.slice(Utf16::encode(p, output)); } - reinterpret_cast(backing)[-1] |= HX_GC_STRING_CHAR16_T; - - return String(reinterpret_cast(backing), bytes / sizeof(char16_t)); + return String(backing.ptr.ptr, chars); } char32_t cpp::encoding::Utf8::codepoint(cpp::marshal::View buffer) From 3dbceb9b2e922fcbe4c27b36d5878956521d8ce7 Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Wed, 24 Dec 2025 11:33:52 +0000 Subject: [PATCH 17/18] Add new view extension cstring function tests --- src/cpp/encoding/Utf16.cpp | 41 +++++++++++++++ src/cpp/encoding/Utf8.cpp | 10 ++-- test/native/cpp/encoding/Ascii.hx | 15 ------ test/native/cpp/encoding/Utf16.hx | 22 -------- test/native/cpp/encoding/Utf8.hx | 20 -------- test/native/tests/encoding/TestUtf16.hx | 14 ++---- test/native/tests/encoding/TestUtf8.hx | 14 ++---- .../marshalling/view/TestViewExtensions.hx | 50 +++++++++++++++++++ 8 files changed, 106 insertions(+), 80 deletions(-) delete mode 100644 test/native/cpp/encoding/Ascii.hx delete mode 100644 test/native/cpp/encoding/Utf16.hx delete mode 100644 test/native/cpp/encoding/Utf8.hx diff --git a/src/cpp/encoding/Utf16.cpp b/src/cpp/encoding/Utf16.cpp index 8682da9e7..ca5d742ee 100644 --- a/src/cpp/encoding/Utf16.cpp +++ b/src/cpp/encoding/Utf16.cpp @@ -19,6 +19,42 @@ namespace { return codepoint >= 0xd800 && codepoint < 0xdc00; } + + bool isAsciiBuffer(View buffer) + { + while (buffer.isEmpty() == false) + { + auto p = cpp::encoding::Utf16::codepoint(buffer); + + if (p > 127) + { + return false; + } + + buffer = buffer.slice(cpp::encoding::Utf16::getByteCount(p)); + } + + return true; + } + + String toAsciiString(View buffer) + { + auto bytes = buffer.length / sizeof(char16_t); + auto chars = View(hx::InternalNew(bytes + 1, false), bytes * sizeof(char)); + auto output = chars.reinterpret(); + + while (buffer.isEmpty() == false) + { + auto p = cpp::encoding::Utf16::codepoint(buffer); + + output[0] = static_cast(p); + + buffer = buffer.slice(cpp::encoding::Utf16::getByteCount(p)); + output = output.slice(1); + } + + return String(chars.ptr.ptr, chars.length); + } } bool cpp::encoding::Utf16::isEncoded(const String& string) @@ -174,6 +210,11 @@ String cpp::encoding::Utf16::decode(cpp::marshal::View buffer) return String::emptyString; } + if (isAsciiBuffer(buffer)) + { + return toAsciiString(buffer); + } + auto chars = int64_t{ 0 }; auto i = int64_t{ 0 }; while (i < buffer.length) diff --git a/src/cpp/encoding/Utf8.cpp b/src/cpp/encoding/Utf8.cpp index 27096e591..b6d0f1407 100644 --- a/src/cpp/encoding/Utf8.cpp +++ b/src/cpp/encoding/Utf8.cpp @@ -5,14 +5,18 @@ using namespace cpp::marshal; namespace { - bool isAsciiBuffer(View& buffer) + bool isAsciiBuffer(View buffer) { - for (auto i = int64_t{ 0 }; i < buffer.length; i++) + while (buffer.isEmpty() == false) { - if (buffer.ptr[i] > 127) + auto p = cpp::encoding::Utf8::codepoint(buffer); + + if (p > 127) { return false; } + + buffer = buffer.slice(cpp::encoding::Utf8::getByteCount(p)); } return true; diff --git a/test/native/cpp/encoding/Ascii.hx b/test/native/cpp/encoding/Ascii.hx deleted file mode 100644 index a96eb54d9..000000000 --- a/test/native/cpp/encoding/Ascii.hx +++ /dev/null @@ -1,15 +0,0 @@ -package cpp.encoding; - -import cpp.UInt8; -import cpp.Int64; -import cpp.marshal.View; - -@:semantics(value) -@:cpp.PointerType({ namespace : [ "cpp", "encoding" ] }) -extern class Ascii { - static function isEncoded(string:String):Bool; - - static function encode(string:String, buffer:View):Int64; - - static function decode(buffer:View):String; -} diff --git a/test/native/cpp/encoding/Utf16.hx b/test/native/cpp/encoding/Utf16.hx deleted file mode 100644 index 161591202..000000000 --- a/test/native/cpp/encoding/Utf16.hx +++ /dev/null @@ -1,22 +0,0 @@ -package cpp.encoding; - -import cpp.UInt8; -import cpp.Int64; -import cpp.Char32; -import cpp.marshal.View; -import haxe.extern.AsVar; - -@:semantics(value) -@:cpp.PointerType({ namespace : [ "cpp", "encoding" ] }) -extern class Utf16 { - static function isEncoded(string:String):Bool; - - static overload function getByteCount(codepoint:Char32):Int64; - static overload function getByteCount(string:String):Int64; - - static overload function encode(string:String, buffer:View):Int64; - static overload function encode(codepoint:Char32, buffer:View):Int64; - - static overload function decode(buffer:View):String; - static overload function decode(buffer:View, codepoint:AsVar):Int64; -} \ No newline at end of file diff --git a/test/native/cpp/encoding/Utf8.hx b/test/native/cpp/encoding/Utf8.hx deleted file mode 100644 index c1461c569..000000000 --- a/test/native/cpp/encoding/Utf8.hx +++ /dev/null @@ -1,20 +0,0 @@ -package cpp.encoding; - -import cpp.UInt8; -import cpp.Int64; -import cpp.Char32; -import cpp.marshal.View; -import haxe.extern.AsVar; - -@:semantics(value) -@:cpp.PointerType({ namespace : [ "cpp", "encoding" ] }) -extern class Utf8 { - static overload function getByteCount(codepoint:Char32):Int64; - static overload function getByteCount(string:String):Int64; - - static overload function encode(string:String, buffer:View):Int64; - static overload function encode(codepoint:Char32, buffer:View):Int64; - - static overload function decode(buffer:View):String; - static overload function decode(buffer:View, codepoint:AsVar):Int64; -} \ No newline at end of file diff --git a/test/native/tests/encoding/TestUtf16.hx b/test/native/tests/encoding/TestUtf16.hx index c970645b8..8198c0152 100644 --- a/test/native/tests/encoding/TestUtf16.hx +++ b/test/native/tests/encoding/TestUtf16.hx @@ -141,23 +141,17 @@ class TestUtf16 extends Test { } public function test_decode_codepoint() { - var codepoint : cpp.Char32 = 0; - var bytes = Bytes.ofHex('6100'); - Assert.equals(2i64, Utf16.decode(bytes.asView(), codepoint)); - Assert.equals('a'.code, cast codepoint); + Assert.equals('a'.code, Utf16.codepoint(bytes.asView())); var bytes = Bytes.ofHex('8501'); - Assert.equals(2i64, Utf16.decode(bytes.asView(), codepoint)); - Assert.equals('Ζ…'.code, cast codepoint); + Assert.equals('Ζ…'.code, Utf16.codepoint(bytes.asView())); var bytes = Bytes.ofHex('D030'); - Assert.equals(2i64, Utf16.decode(bytes.asView(), codepoint)); - Assert.equals('バ'.code, cast codepoint); + Assert.equals('バ'.code, Utf16.codepoint(bytes.asView())); var bytes = Bytes.ofHex('34D833DD'); - Assert.equals(4i64, Utf16.decode(bytes.asView(), codepoint)); - Assert.equals('𝄳'.code, cast codepoint); + Assert.equals('𝄳'.code, Utf16.codepoint(bytes.asView())); } public function test_decode_string() { diff --git a/test/native/tests/encoding/TestUtf8.hx b/test/native/tests/encoding/TestUtf8.hx index 739633f3c..716646709 100644 --- a/test/native/tests/encoding/TestUtf8.hx +++ b/test/native/tests/encoding/TestUtf8.hx @@ -118,23 +118,17 @@ class TestUtf8 extends Test { } public function test_decode_codepoint() { - var codepoint : cpp.Char32 = 0; - var bytes = Bytes.ofHex('61'); - Assert.equals(1i64, Utf8.decode(bytes.asView(), codepoint)); - Assert.equals('a'.code, cast codepoint); + Assert.equals('a'.code, Utf8.codepoint(bytes.asView())); var bytes = Bytes.ofHex('c685'); - Assert.equals(2i64, Utf8.decode(bytes.asView(), codepoint)); - Assert.equals('Ζ…'.code, cast codepoint); + Assert.equals('Ζ…'.code, Utf8.codepoint(bytes.asView())); var bytes = Bytes.ofHex('e38390'); - Assert.equals(3i64, Utf8.decode(bytes.asView(), codepoint)); - Assert.equals('バ'.code, cast codepoint); + Assert.equals('バ'.code, Utf8.codepoint(bytes.asView())); var bytes = Bytes.ofHex('f09d84b3'); - Assert.equals(4i64, Utf8.decode(bytes.asView(), codepoint)); - Assert.equals('𝄳'.code, cast codepoint); + Assert.equals('𝄳'.code, Utf8.codepoint(bytes.asView())); } public function test_decode_string() { diff --git a/test/native/tests/marshalling/view/TestViewExtensions.hx b/test/native/tests/marshalling/view/TestViewExtensions.hx index c9d2e2609..e9ff3f942 100644 --- a/test/native/tests/marshalling/view/TestViewExtensions.hx +++ b/test/native/tests/marshalling/view/TestViewExtensions.hx @@ -162,4 +162,54 @@ class TestViewExtensions extends Test { } } } + + function test_szToString_char_no_null() { + final vec = new Vector(4); + vec[0] = 't'.code; + vec[1] = 'e'.code; + vec[2] = 's'.code; + vec[3] = 't'.code; + + Assert.equals("test", vec.asView().szToString()); + } + + function test_szToString_char() { + final vec = new Vector(9); + vec[0] = 't'.code; + vec[1] = 'e'.code; + vec[2] = 's'.code; + vec[3] = 't'.code; + vec[4] = 0; + vec[5] = 't'.code; + vec[6] = 'e'.code; + vec[7] = 's'.code; + vec[8] = 't'.code; + + Assert.equals("test", vec.asView().szToString()); + } + + function test_szToString_char16_no_null() { + final vec = new Vector(4); + vec[0] = 't'.code; + vec[1] = 'e'.code; + vec[2] = 's'.code; + vec[3] = 't'.code; + + Assert.equals("test", vec.asView().szToString()); + } + + function test_szToString16_char() { + final vec = new Vector(9); + vec[0] = 't'.code; + vec[1] = 'e'.code; + vec[2] = 's'.code; + vec[3] = 't'.code; + vec[4] = 0; + vec[5] = 't'.code; + vec[6] = 'e'.code; + vec[7] = 's'.code; + vec[8] = 't'.code; + + Assert.equals("test", vec.asView().szToString()); + } } \ No newline at end of file From 74e7e5725c11f92b3f38b951ef2a0d15e5d7b350 Mon Sep 17 00:00:00 2001 From: Aidan Lee Date: Wed, 24 Dec 2025 12:31:53 +0000 Subject: [PATCH 18/18] Add a smart strings guard --- include/cpp/marshal/Marshal.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/cpp/marshal/Marshal.hpp b/include/cpp/marshal/Marshal.hpp index 585c72ba8..4bb3076be 100644 --- a/include/cpp/marshal/Marshal.hpp +++ b/include/cpp/marshal/Marshal.hpp @@ -35,6 +35,7 @@ inline cpp::marshal::View cpp::marshal::Marshal::asCharView(const ::String inline cpp::marshal::View cpp::marshal::Marshal::asWideCharView(const ::String& string) { +#if defined(HX_SMART_STRINGS) if (null() == string) { hx::NullReference("string", false); @@ -46,6 +47,9 @@ inline cpp::marshal::View cpp::marshal::Marshal::asWideCharView(const } return View(const_cast(string.raw_wptr()), string.length); +#else + return hx::Throw(HX_CSTRING("HX_SMART_STRINGS not defined")); +#endif } template