From e121aaa974e8863a862b6c6e5272ed867052926d Mon Sep 17 00:00:00 2001 From: Tobiasz Laskowski Date: Thu, 5 Feb 2026 17:54:37 +0000 Subject: [PATCH 1/6] Merge encodings into a single file The decode methods use methods from other encodings. For example, Utf8::decode calls Utf16::getCharCount and Utf16::encode in a loop. Placing them in the same file makes it easier for them to be inlined which improves performance. --- src/cpp/encoding/Ascii.cpp | 66 ---- src/cpp/encoding/Encodings.cpp | 641 +++++++++++++++++++++++++++++++++ src/cpp/encoding/Utf16.cpp | 279 -------------- src/cpp/encoding/Utf8.cpp | 303 ---------------- toolchain/haxe-target.xml | 10 +- 5 files changed, 644 insertions(+), 655 deletions(-) delete mode 100644 src/cpp/encoding/Ascii.cpp create mode 100644 src/cpp/encoding/Encodings.cpp delete mode 100644 src/cpp/encoding/Utf16.cpp delete mode 100644 src/cpp/encoding/Utf8.cpp diff --git a/src/cpp/encoding/Ascii.cpp b/src/cpp/encoding/Ascii.cpp deleted file mode 100644 index 7a0acd8bf..000000000 --- a/src/cpp/encoding/Ascii.cpp +++ /dev/null @@ -1,66 +0,0 @@ -#include - -using namespace cpp::marshal; - -bool cpp::encoding::Ascii::isEncoded(const String& string) -{ - if (null() == string) - { - hx::NullReference("String", false); - } - - return string.isAsciiEncoded(); -} - -int64_t cpp::encoding::Ascii::encode(const String& string, View buffer) -{ - if (null() == string) - { - hx::NullReference("String", false); - } - - if (string.isUTF16Encoded()) - { - hx::Throw(HX_CSTRING("String cannot be encoded to ASCII")); - } - - auto src = cpp::marshal::View(string.raw_ptr(), string.length).reinterpret(); - - if (src.tryCopyTo(buffer)) - { - return src.length; - } - else - { - return hx::Throw(HX_CSTRING("Buffer too small")); - } -} - -String cpp::encoding::Ascii::decode(View view) -{ - if (view.isEmpty()) - { - return hx::Throw(HX_CSTRING("View is empty")); - } - - auto bytes = int64_t{ 0 }; - auto i = int64_t{ 0 }; - auto chars = view.reinterpret(); - - while 
(i < chars.length && 0 != chars.ptr[i]) - { - bytes += sizeof(char); - i++; - } - - if (0 == bytes) - { - return String::emptyString; - } - - auto backing = hx::NewGCPrivate(0, bytes + sizeof(char)); - - std::memcpy(backing, view.ptr.ptr, bytes); - - return String(static_cast(backing), bytes / sizeof(char)); -} diff --git a/src/cpp/encoding/Encodings.cpp b/src/cpp/encoding/Encodings.cpp new file mode 100644 index 000000000..c3d30d314 --- /dev/null +++ b/src/cpp/encoding/Encodings.cpp @@ -0,0 +1,641 @@ +#include +#include + +using namespace cpp::marshal; + +bool cpp::encoding::Ascii::isEncoded(const String& string) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + return string.isAsciiEncoded(); +} + +int64_t cpp::encoding::Ascii::encode(const String& string, View buffer) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + if (string.isUTF16Encoded()) + { + hx::Throw(HX_CSTRING("String cannot be encoded to ASCII")); + } + + auto src = cpp::marshal::View(string.raw_ptr(), string.length).reinterpret(); + + if (src.tryCopyTo(buffer)) + { + return src.length; + } + else + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } +} + +String cpp::encoding::Ascii::decode(View view) +{ + if (view.isEmpty()) + { + return hx::Throw(HX_CSTRING("View is empty")); + } + + auto bytes = int64_t{ 0 }; + auto i = int64_t{ 0 }; + auto chars = view.reinterpret(); + + while (i < chars.length && 0 != chars.ptr[i]) + { + bytes += sizeof(char); + i++; + } + + if (0 == bytes) + { + return String::emptyString; + } + + auto backing = hx::NewGCPrivate(0, bytes + sizeof(char)); + + std::memcpy(backing, view.ptr.ptr, bytes); + + return String(static_cast(backing), bytes / sizeof(char)); +} + +namespace +{ + bool isAsciiUtf8Buffer(const View& buffer) + { + auto i = int64_t{ 0 }; + while (i < buffer.length) + { + auto p = cpp::encoding::Utf8::codepoint(buffer.slice(i)); + + if (p > 127) + { + return false; + } + + i += 
cpp::encoding::Utf8::getByteCount(p); + } + + return true; + } +} + +int cpp::encoding::Utf8::getByteCount(const null&) +{ + hx::NullReference("String", false); + return 0; +} + +int cpp::encoding::Utf8::getByteCount(const char32_t& codepoint) +{ + if (codepoint <= 0x7F) + { + return 1; + } + else if (codepoint <= 0x7FF) + { + return 2; + } + else if (codepoint <= 0xFFFF) + { + return 3; + } + else + { + return 4; + } +} + +int64_t cpp::encoding::Utf8::getByteCount(const String& string) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + if (string.isAsciiEncoded()) + { + return string.length; + } + +#if defined(HX_SMART_STRINGS) + auto source = View(string.raw_wptr(), string.length).reinterpret(); + auto length = source.length; + auto bytes = int64_t{ 0 }; + auto i = int64_t{ 0 }; + + while (i < source.length) + { + auto slice = source.slice(i); + auto p = Utf16::codepoint(slice); + + i += Utf16::getByteCount(p); + bytes += getByteCount(p); + } + + return bytes; +#else + return hx::Throw(HX_CSTRING("Unexpected encoding error")); +#endif +} + +int cpp::encoding::Utf8::getCharCount(const null&) +{ + hx::NullReference("String", false); + return 0; +} + +int cpp::encoding::Utf8::getCharCount(const char32_t& codepoint) +{ + return getByteCount(codepoint) / sizeof(char); +} + +int64_t cpp::encoding::Utf8::getCharCount(const String& string) +{ + return getByteCount(string) / sizeof(char); +} + +int cpp::encoding::Utf8::encode(const null&, const cpp::marshal::View& buffer) +{ + hx::NullReference("String", false); + return 0; +} + +int64_t cpp::encoding::Utf8::encode(const String& string, const cpp::marshal::View& buffer) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + if (0 == string.length) + { + return 0; + } + + if (buffer.isEmpty()) + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + + if (string.isAsciiEncoded()) + { + auto src = cpp::marshal::View(reinterpret_cast(const_cast(string.raw_ptr())), 
string.length); + + if (src.tryCopyTo(buffer)) + { + return src.length; + } + else + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + } + +#if defined(HX_SMART_STRINGS) + if (getByteCount(string) > buffer.length) + { + hx::Throw(HX_CSTRING("Buffer too small")); + } + + auto initialPtr = buffer.ptr.ptr; + auto source = View(string.raw_wptr(), string.length).reinterpret(); + auto i = int64_t{ 0 }; + auto k = int64_t{ 0 }; + + while (i < source.length) + { + auto p = Utf16::codepoint(source.slice(i)); + + i += Utf16::getByteCount(p); + k += encode(p, buffer.slice(k)); + } + + return k; +#else + return hx::Throw(HX_CSTRING("Unexpected encoding error")); +#endif +} + +int cpp::encoding::Utf8::encode(const char32_t& codepoint, const cpp::marshal::View& buffer) +{ + if (codepoint <= 0x7F) + { + buffer[0] = static_cast(codepoint); + + return 1; + } + else if (codepoint <= 0x7FF) + { + auto data = std::array + { { + static_cast(0xC0 | (codepoint >> 6)), + static_cast(0x80 | (codepoint & 63)) + } }; + auto src = View(data.data(), data.size()); + + src.copyTo(buffer); + + return data.size(); + } + else if (codepoint <= 0xFFFF) + { + auto data = std::array + { { + static_cast(0xE0 | (codepoint >> 12)), + static_cast(0x80 | ((codepoint >> 6) & 63)), + static_cast(0x80 | (codepoint & 63)) + } }; + + auto src = View(data.data(), data.size()); + + src.copyTo(buffer); + + return data.size(); + } + else + { + auto data = std::array + { { + static_cast(0xF0 | (codepoint >> 18)), + static_cast(0x80 | ((codepoint >> 12) & 63)), + static_cast(0x80 | ((codepoint >> 6) & 63)), + static_cast(0x80 | (codepoint & 63)) + } }; + + auto src = View(data.data(), data.size()); + + src.copyTo(buffer); + + return data.size(); + } +} + +String cpp::encoding::Utf8::decode(const cpp::marshal::View& buffer) +{ + if (buffer.isEmpty()) + { + return String::emptyString; + } + + if (isAsciiUtf8Buffer(buffer)) + { + return Ascii::decode(buffer); + } + +#if defined(HX_SMART_STRINGS) + auto chars = 
int64_t{ 0 }; + auto i = int64_t{ 0 }; + + while (i < buffer.length) + { + auto p = codepoint(buffer.slice(i)); + + i += getByteCount(p); + chars += Utf16::getCharCount(p); + } + + auto backing = View(::String::allocChar16Ptr(chars), chars); + auto output = backing.reinterpret(); + auto k = int64_t{ 0 }; + + i = 0; + while (i < buffer.length) + { + auto p = codepoint(buffer.slice(i)); + + i += getByteCount(p); + k += Utf16::encode(p, output.slice(k)); + } + + return String(backing.ptr.ptr, chars); +#else + auto backing = View(hx::InternalNew(buffer.length, false), buffer.length); + + std::memcpy(backing.ptr.ptr, buffer.ptr.ptr, buffer.length); + + return String(backing.ptr.ptr, static_cast(buffer.length)); +#endif +} + +char32_t cpp::encoding::Utf8::codepoint(const cpp::marshal::View& buffer) +{ + auto b0 = static_cast(buffer[0]); + + if ((b0 & 0x80) == 0) + { + return b0; + } + else if ((b0 & 0xE0) == 0xC0) + { + return (static_cast(b0 & 0x1F) << 6) | static_cast(buffer.slice(1)[0] & 0x3F); + } + else if ((b0 & 0xF0) == 0xE0) + { + auto staging = std::array(); + auto dst = View(staging.data(), staging.size()); + + buffer.slice(1, staging.size()).copyTo(dst); + + return (static_cast(b0 & 0x0F) << 12) | (static_cast(staging[0] & 0x3F) << 6) | static_cast(staging[1] & 0x3F); + } + else if ((b0 & 0xF8) == 0xF0) + { + auto staging = std::array(); + auto dst = View(staging.data(), staging.size()); + + buffer.slice(1, staging.size()).copyTo(dst); + + return + (static_cast(b0 & 0x07) << 18) | + (static_cast(staging[0] & 0x3F) << 12) | + (static_cast(staging[1] & 0x3F) << 6) | + static_cast(staging[2] & 0x3F); + } + else + { + return int{ hx::Throw(HX_CSTRING("Failed to read codepoint")) }; + } +} + +namespace +{ + bool isSurrogate(char32_t codepoint) + { + return codepoint >= 0xd800 && codepoint < 0xe000; + } + + bool isLowSurrogate(char32_t codepoint) + { + return codepoint >= 0xdc00 && codepoint < 0xe000; + } + + bool isHighSurrogate(char32_t codepoint) + { + return 
codepoint >= 0xd800 && codepoint < 0xdc00; + } + + bool isAsciiUtf16Buffer(const View& buffer) + { + auto i = int64_t{ 0 }; + while (i < buffer.length) + { + auto p = cpp::encoding::Utf16::codepoint(buffer.slice(i)); + + if (p > 127) + { + return false; + } + + i += cpp::encoding::Utf16::getByteCount(p); + } + + return true; + } + + String toAsciiString(const View& buffer) + { + auto bytes = buffer.length / sizeof(char16_t); + auto chars = View(hx::InternalNew(bytes + 1, false), bytes * sizeof(char)); + auto i = int64_t{ 0 }; + auto k = int64_t{ 0 }; + + while (i < buffer.length) + { + auto p = cpp::encoding::Utf16::codepoint(buffer.slice(i)); + + chars[k++] = static_cast(p); + + i += cpp::encoding::Utf16::getByteCount(p); + } + + return String(chars.ptr.ptr, chars.length); + } +} + +bool cpp::encoding::Utf16::isEncoded(const String& string) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + return string.isUTF16Encoded(); +} + +int cpp::encoding::Utf16::getByteCount(const null&) +{ + hx::NullReference("String", false); + return 0; +} + +int cpp::encoding::Utf16::getByteCount(const char32_t& codepoint) +{ + return codepoint <= 0xFFFF ? 
2 : 4; +} + +int64_t cpp::encoding::Utf16::getByteCount(const String& string) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + if (string.isUTF16Encoded()) + { + return string.length * sizeof(char16_t); + } + else + { + auto bytes = int64_t{ 0 }; + for (auto i = 0; i < string.length; i++) + { + bytes += getByteCount(static_cast(string.raw_ptr()[i])); + } + + return bytes; + } +} + +int cpp::encoding::Utf16::getCharCount(const null&) +{ + hx::NullReference("String", false); + return 0; +} + +int cpp::encoding::Utf16::getCharCount(const char32_t& codepoint) +{ + return getByteCount(codepoint) / sizeof(char16_t); +} + +int64_t cpp::encoding::Utf16::getCharCount(const String& string) +{ + return getByteCount(string) / sizeof(char16_t); +} + +int cpp::encoding::Utf16::encode(const null&, const cpp::marshal::View& buffer) +{ + hx::NullReference("String", false); + return 0; +} + +int64_t cpp::encoding::Utf16::encode(const String& string, const cpp::marshal::View& buffer) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + if (0 == string.length) + { + return 0; + } + + if (buffer.isEmpty()) + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + +#if defined(HX_SMART_STRINGS) + if (string.isUTF16Encoded()) + { + auto src = cpp::marshal::View(reinterpret_cast(const_cast(string.raw_wptr())), string.length * sizeof(char16_t)); + + if (src.tryCopyTo(buffer)) + { + return src.length; + } + else + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + } + else +#endif + { + auto bytes = int64_t{ 0 }; + for (auto i = 0; i < string.length; i++) + { + bytes += getByteCount(static_cast(string.raw_ptr()[i])); + } + + if (bytes > buffer.length) + { + return hx::Throw(HX_CSTRING("Buffer too small")); + } + + auto i = int64_t{ 0 }; + for (auto k = 0; k < string.length; k++) + { + i += encode(static_cast(string.raw_ptr()[k]), buffer.slice(i)); + } + + return bytes; + } +} + +int cpp::encoding::Utf16::encode(const char32_t& 
codepoint, const cpp::marshal::View& buffer) +{ + if (codepoint < 0xD800) + { + Marshal::writeUInt16(buffer, static_cast(codepoint)); + + return 2; + } + else if (codepoint < 0xE000) + { + // D800 - DFFF is invalid + + return hx::Throw(HX_CSTRING("Invalid UTF16")); + } + else if (codepoint < 0x10000) + { + Marshal::writeUInt16(buffer, static_cast(codepoint)); + + return 2; + } + else if (codepoint < 0x110000) + { + auto staging = std::array(); + auto fst = View(staging.data(), 2); + auto snd = View(staging.data() + 2, 2); + auto all = View(staging.data(), staging.size()); + + Marshal::writeUInt16(fst, 0xD800 + (((codepoint - 0x10000) >> 10) & 0x3FF)); + Marshal::writeUInt16(snd, 0xDC00 + ((codepoint - 0x10000) & 0x3FF)); + + all.copyTo(buffer); + + return 4; + } + + return 0; +} + +String cpp::encoding::Utf16::decode(const cpp::marshal::View& buffer) +{ + if (buffer.isEmpty()) + { + return String::emptyString; + } + + if (isAsciiUtf16Buffer(buffer)) + { + return toAsciiString(buffer); + } + +#if defined(HX_SMART_STRINGS) + auto i = int64_t{ 0 }; + while (i < buffer.length) + { + auto p = codepoint(buffer.slice(i)); + + i += getByteCount(p); + } + + auto chars = i / sizeof(char16_t); + auto backing = View(::String::allocChar16Ptr(chars), chars); + auto output = backing.reinterpret(); + auto k = int64_t{ 0 }; + + i = 0; + while (i < buffer.length) + { + auto p = codepoint(buffer.slice(i)); + + i += getByteCount(p); + k += encode(p, output.slice(k)); + } + + return String(backing.ptr.ptr, chars); +#else + return hx::Throw(HX_CSTRING("Not Implemented : UTF16 decode when HX_SMART_STRINGS is not defined")); +#endif +} + +char32_t cpp::encoding::Utf16::codepoint(const cpp::marshal::View& buffer) +{ + auto first = static_cast(Marshal::readUInt16(buffer)); + + if (0xD800 <= first && first < 0xDc00) + { + auto second = static_cast(Marshal::readUInt16(buffer.slice(2))); + if (0xDC00 <= second && second < 0xE000) + { + return static_cast((((first - 0xD800) << 10) | (second - 
0xDC00)) + 0x10000); + } + + return int{ hx::Throw(HX_CSTRING("Invalid UTF16")) }; + } + else + { + return static_cast(first); + } +} diff --git a/src/cpp/encoding/Utf16.cpp b/src/cpp/encoding/Utf16.cpp deleted file mode 100644 index 930513259..000000000 --- a/src/cpp/encoding/Utf16.cpp +++ /dev/null @@ -1,279 +0,0 @@ -#include -#include - -using namespace cpp::marshal; - -namespace -{ - bool isSurrogate(char32_t codepoint) - { - return codepoint >= 0xd800 && codepoint < 0xe000; - } - - bool isLowSurrogate(char32_t codepoint) - { - return codepoint >= 0xdc00 && codepoint < 0xe000; - } - - bool isHighSurrogate(char32_t codepoint) - { - return codepoint >= 0xd800 && codepoint < 0xdc00; - } - - bool isAsciiBuffer(const View& buffer) - { - auto i = int64_t{ 0 }; - while (i < buffer.length) - { - auto p = cpp::encoding::Utf16::codepoint(buffer.slice(i)); - - if (p > 127) - { - return false; - } - - i += cpp::encoding::Utf16::getByteCount(p); - } - - return true; - } - - String toAsciiString(const View& buffer) - { - auto bytes = buffer.length / sizeof(char16_t); - auto chars = View(hx::InternalNew(bytes + 1, false), bytes * sizeof(char)); - auto i = int64_t{ 0 }; - auto k = int64_t{ 0 }; - - while (i < buffer.length) - { - auto p = cpp::encoding::Utf16::codepoint(buffer.slice(i)); - - chars[k++] = static_cast(p); - - i += cpp::encoding::Utf16::getByteCount(p); - } - - return String(chars.ptr.ptr, chars.length); - } -} - -bool cpp::encoding::Utf16::isEncoded(const String& string) -{ - if (null() == string) - { - hx::NullReference("String", false); - } - - return string.isUTF16Encoded(); -} - -int cpp::encoding::Utf16::getByteCount(const null&) -{ - hx::NullReference("String", false); - return 0; -} - -int cpp::encoding::Utf16::getByteCount(const char32_t& codepoint) -{ - return codepoint <= 0xFFFF ? 
2 : 4; -} - -int64_t cpp::encoding::Utf16::getByteCount(const String& string) -{ - if (null() == string) - { - hx::NullReference("String", false); - } - - if (string.isUTF16Encoded()) - { - return string.length * sizeof(char16_t); - } - else - { - auto bytes = int64_t{ 0 }; - for (auto i = 0; i < string.length; i++) - { - bytes += getByteCount(static_cast(string.raw_ptr()[i])); - } - - return bytes; - } -} - -int cpp::encoding::Utf16::getCharCount(const null&) -{ - hx::NullReference("String", false); - return 0; -} - -int cpp::encoding::Utf16::getCharCount(const char32_t& codepoint) -{ - return getByteCount(codepoint) / sizeof(char16_t); -} - -int64_t cpp::encoding::Utf16::getCharCount(const String& string) -{ - return getByteCount(string) / sizeof(char16_t); -} - -int cpp::encoding::Utf16::encode(const null&, const cpp::marshal::View& buffer) -{ - hx::NullReference("String", false); - return 0; -} - -int64_t cpp::encoding::Utf16::encode(const String& string, const cpp::marshal::View& buffer) -{ - if (null() == string) - { - hx::NullReference("String", false); - } - - if (0 == string.length) - { - return 0; - } - - if (buffer.isEmpty()) - { - return hx::Throw(HX_CSTRING("Buffer too small")); - } - -#if defined(HX_SMART_STRINGS) - if (string.isUTF16Encoded()) - { - auto src = cpp::marshal::View(reinterpret_cast(const_cast(string.raw_wptr())), string.length * sizeof(char16_t)); - - if (src.tryCopyTo(buffer)) - { - return src.length; - } - else - { - return hx::Throw(HX_CSTRING("Buffer too small")); - } - } - else -#endif - { - auto bytes = int64_t{ 0 }; - for (auto i = 0; i < string.length; i++) - { - bytes += getByteCount(static_cast(string.raw_ptr()[i])); - } - - if (bytes > buffer.length) - { - return hx::Throw(HX_CSTRING("Buffer too small")); - } - - auto i = int64_t{ 0 }; - for (auto k = 0; k < string.length; k++) - { - i += encode(static_cast(string.raw_ptr()[k]), buffer.slice(i)); - } - - return bytes; - } -} - -int cpp::encoding::Utf16::encode(const char32_t& 
codepoint, const cpp::marshal::View& buffer) -{ - if (codepoint < 0xD800) - { - Marshal::writeUInt16(buffer, static_cast(codepoint)); - - return 2; - } - else if (codepoint < 0xE000) - { - // D800 - DFFF is invalid - - return hx::Throw(HX_CSTRING("Invalid UTF16")); - } - else if (codepoint < 0x10000) - { - Marshal::writeUInt16(buffer, static_cast(codepoint)); - - return 2; - } - else if (codepoint < 0x110000) - { - auto staging = std::array(); - auto fst = View(staging.data(), 2); - auto snd = View(staging.data() + 2, 2); - auto all = View(staging.data(), staging.size()); - - Marshal::writeUInt16(fst, 0xD800 + (((codepoint - 0x10000) >> 10) & 0x3FF)); - Marshal::writeUInt16(snd, 0xDC00 + ((codepoint - 0x10000) & 0x3FF)); - - all.copyTo(buffer); - - return 4; - } - - return 0; -} - -String cpp::encoding::Utf16::decode(const cpp::marshal::View& buffer) -{ - if (buffer.isEmpty()) - { - return String::emptyString; - } - - if (isAsciiBuffer(buffer)) - { - return toAsciiString(buffer); - } - -#if defined(HX_SMART_STRINGS) - auto i = int64_t{ 0 }; - while (i < buffer.length) - { - auto p = codepoint(buffer.slice(i)); - - i += getByteCount(p); - } - - auto chars = i / sizeof(char16_t); - auto backing = View(::String::allocChar16Ptr(chars), chars); - auto output = backing.reinterpret(); - auto k = int64_t{ 0 }; - - i = 0; - while (i < buffer.length) - { - auto p = codepoint(buffer.slice(i)); - - i += getByteCount(p); - k += encode(p, output.slice(k)); - } - - return String(backing.ptr.ptr, chars); -#else - return hx::Throw(HX_CSTRING("Not Implemented : UTF16 decode when HX_SMART_STRINGS is not defined")); -#endif -} - -char32_t cpp::encoding::Utf16::codepoint(const cpp::marshal::View& buffer) -{ - auto first = static_cast(Marshal::readUInt16(buffer)); - - if (0xD800 <= first && first < 0xDc00) - { - auto second = static_cast(Marshal::readUInt16(buffer.slice(2))); - if (0xDC00 <= second && second < 0xE000) - { - return static_cast((((first - 0xD800) << 10) | (second - 
0xDC00)) + 0x10000); - } - - return int{ hx::Throw(HX_CSTRING("Invalid UTF16")) }; - } - else - { - return static_cast(first); - } -} diff --git a/src/cpp/encoding/Utf8.cpp b/src/cpp/encoding/Utf8.cpp deleted file mode 100644 index 6ff51af96..000000000 --- a/src/cpp/encoding/Utf8.cpp +++ /dev/null @@ -1,303 +0,0 @@ -#include -#include - -using namespace cpp::marshal; - -namespace -{ - bool isAsciiBuffer(const View& buffer) - { - auto i = int64_t{ 0 }; - while (i < buffer.length) - { - auto p = cpp::encoding::Utf8::codepoint(buffer.slice(i)); - - if (p > 127) - { - return false; - } - - i += cpp::encoding::Utf8::getByteCount(p); - } - - return true; - } -} - -int cpp::encoding::Utf8::getByteCount(const null&) -{ - hx::NullReference("String", false); - return 0; -} - -int cpp::encoding::Utf8::getByteCount(const char32_t& codepoint) -{ - if (codepoint <= 0x7F) - { - return 1; - } - else if (codepoint <= 0x7FF) - { - return 2; - } - else if (codepoint <= 0xFFFF) - { - return 3; - } - else - { - return 4; - } -} - -int64_t cpp::encoding::Utf8::getByteCount(const String& string) -{ - if (null() == string) - { - hx::NullReference("String", false); - } - - if (string.isAsciiEncoded()) - { - return string.length; - } - -#if defined(HX_SMART_STRINGS) - auto source = View(string.raw_wptr(), string.length).reinterpret(); - auto length = source.length; - auto bytes = int64_t{ 0 }; - auto i = int64_t{ 0 }; - - while (i < source.length) - { - auto slice = source.slice(i); - auto p = Utf16::codepoint(slice); - - i += Utf16::getByteCount(p); - bytes += getByteCount(p); - } - - return bytes; -#else - return hx::Throw(HX_CSTRING("Unexpected encoding error")); -#endif -} - -int cpp::encoding::Utf8::getCharCount(const null&) -{ - hx::NullReference("String", false); - return 0; -} - -int cpp::encoding::Utf8::getCharCount(const char32_t& codepoint) -{ - return getByteCount(codepoint) / sizeof(char); -} - -int64_t cpp::encoding::Utf8::getCharCount(const String& string) -{ - return 
getByteCount(string) / sizeof(char); -} - -int cpp::encoding::Utf8::encode(const null&, const cpp::marshal::View& buffer) -{ - hx::NullReference("String", false); - return 0; -} - -int64_t cpp::encoding::Utf8::encode(const String& string, const cpp::marshal::View& buffer) -{ - if (null() == string) - { - hx::NullReference("String", false); - } - - if (0 == string.length) - { - return 0; - } - - if (buffer.isEmpty()) - { - return hx::Throw(HX_CSTRING("Buffer too small")); - } - - if (string.isAsciiEncoded()) - { - auto src = cpp::marshal::View(reinterpret_cast(const_cast(string.raw_ptr())), string.length); - - if (src.tryCopyTo(buffer)) - { - return src.length; - } - else - { - return hx::Throw(HX_CSTRING("Buffer too small")); - } - } - -#if defined(HX_SMART_STRINGS) - if (getByteCount(string) > buffer.length) - { - hx::Throw(HX_CSTRING("Buffer too small")); - } - - auto initialPtr = buffer.ptr.ptr; - auto source = View(string.raw_wptr(), string.length).reinterpret(); - auto i = int64_t{ 0 }; - auto k = int64_t{ 0 }; - - while (i < source.length) - { - auto p = Utf16::codepoint(source.slice(i)); - - i += Utf16::getByteCount(p); - k += encode(p, buffer.slice(k)); - } - - return k; -#else - return hx::Throw(HX_CSTRING("Unexpected encoding error")); -#endif -} - -int cpp::encoding::Utf8::encode(const char32_t& codepoint, const cpp::marshal::View& buffer) -{ - if (codepoint <= 0x7F) - { - buffer[0] = static_cast(codepoint); - - return 1; - } - else if (codepoint <= 0x7FF) - { - auto data = std::array - { { - static_cast(0xC0 | (codepoint >> 6)), - static_cast(0x80 | (codepoint & 63)) - } }; - auto src = View(data.data(), data.size()); - - src.copyTo(buffer); - - return data.size(); - } - else if (codepoint <= 0xFFFF) - { - auto data = std::array - { { - static_cast(0xE0 | (codepoint >> 12)), - static_cast(0x80 | ((codepoint >> 6) & 63)), - static_cast(0x80 | (codepoint & 63)) - } }; - - auto src = View(data.data(), data.size()); - - src.copyTo(buffer); - - return 
data.size(); - } - else - { - auto data = std::array - { { - static_cast(0xF0 | (codepoint >> 18)), - static_cast(0x80 | ((codepoint >> 12) & 63)), - static_cast(0x80 | ((codepoint >> 6) & 63)), - static_cast(0x80 | (codepoint & 63)) - } }; - - auto src = View(data.data(), data.size()); - - src.copyTo(buffer); - - return data.size(); - } -} - -String cpp::encoding::Utf8::decode(const cpp::marshal::View& buffer) -{ - if (buffer.isEmpty()) - { - return String::emptyString; - } - - if (isAsciiBuffer(buffer)) - { - return Ascii::decode(buffer); - } - -#if defined(HX_SMART_STRINGS) - auto chars = int64_t{ 0 }; - auto i = int64_t{ 0 }; - - while (i < buffer.length) - { - auto p = codepoint(buffer.slice(i)); - - i += getByteCount(p); - chars += Utf16::getCharCount(p); - } - - auto backing = View(::String::allocChar16Ptr(chars), chars); - auto output = backing.reinterpret(); - auto k = int64_t{ 0 }; - - i = 0; - while (i < buffer.length) - { - auto p = codepoint(buffer.slice(i)); - - i += getByteCount(p); - k += Utf16::encode(p, output.slice(k)); - } - - return String(backing.ptr.ptr, chars); -#else - auto backing = View(hx::InternalNew(buffer.length, false), buffer.length); - - std::memcpy(backing.ptr.ptr, buffer.ptr.ptr, buffer.length); - - return String(backing.ptr.ptr, static_cast(buffer.length)); -#endif -} - -char32_t cpp::encoding::Utf8::codepoint(const cpp::marshal::View& buffer) -{ - auto b0 = static_cast(buffer[0]); - - if ((b0 & 0x80) == 0) - { - return b0; - } - else if ((b0 & 0xE0) == 0xC0) - { - return (static_cast(b0 & 0x1F) << 6) | static_cast(buffer.slice(1)[0] & 0x3F); - } - else if ((b0 & 0xF0) == 0xE0) - { - auto staging = std::array(); - auto dst = View(staging.data(), staging.size()); - - buffer.slice(1, staging.size()).copyTo(dst); - - return (static_cast(b0 & 0x0F) << 12) | (static_cast(staging[0] & 0x3F) << 6) | static_cast(staging[1] & 0x3F); - } - else if ((b0 & 0xF8) == 0xF0) - { - auto staging = std::array(); - auto dst = View(staging.data(), 
staging.size()); - - buffer.slice(1, staging.size()).copyTo(dst); - - return - (static_cast(b0 & 0x07) << 18) | - (static_cast(staging[0] & 0x3F) << 12) | - (static_cast(staging[1] & 0x3F) << 6) | - static_cast(staging[2] & 0x3F); - } - else - { - return int{ hx::Throw(HX_CSTRING("Failed to read codepoint")) }; - } -} diff --git a/toolchain/haxe-target.xml b/toolchain/haxe-target.xml index 11dd88576..d3ff3cf44 100644 --- a/toolchain/haxe-target.xml +++ b/toolchain/haxe-target.xml @@ -83,7 +83,7 @@ - + @@ -190,7 +190,7 @@ - + @@ -204,9 +204,7 @@ - - - + @@ -280,5 +278,3 @@ - - From 93cfd0e5a38c85efab642312861071e5ba32a35c Mon Sep 17 00:00:00 2001 From: Tobiasz Laskowski Date: Thu, 5 Feb 2026 19:16:48 +0000 Subject: [PATCH 2/6] Avoid duplicate string iteration for ascii checks --- src/cpp/encoding/Encodings.cpp | 37 +++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/src/cpp/encoding/Encodings.cpp b/src/cpp/encoding/Encodings.cpp index c3d30d314..98c206e97 100644 --- a/src/cpp/encoding/Encodings.cpp +++ b/src/cpp/encoding/Encodings.cpp @@ -285,23 +285,29 @@ String cpp::encoding::Utf8::decode(const cpp::marshal::View& buffer) return String::emptyString; } - if (isAsciiUtf8Buffer(buffer)) - { - return Ascii::decode(buffer); - } - #if defined(HX_SMART_STRINGS) auto chars = int64_t{ 0 }; auto i = int64_t{ 0 }; + bool isAscii = true; while (i < buffer.length) { auto p = codepoint(buffer.slice(i)); + if (p > 127) + { + isAscii = false; + } + i += getByteCount(p); chars += Utf16::getCharCount(p); } + if (isAscii) + { + return Ascii::decode(buffer); + } + auto backing = View(::String::allocChar16Ptr(chars), chars); auto output = backing.reinterpret(); auto k = int64_t{ 0 }; @@ -317,6 +323,11 @@ String cpp::encoding::Utf8::decode(const cpp::marshal::View& buffer) return String(backing.ptr.ptr, chars); #else + if (isAsciiUtf8Buffer(buffer)) + { + return Ascii::decode(buffer); + } + auto backing = View(hx::InternalNew(buffer.length, 
false), buffer.length); std::memcpy(backing.ptr.ptr, buffer.ptr.ptr, buffer.length); @@ -586,20 +597,26 @@ String cpp::encoding::Utf16::decode(const cpp::marshal::View& buffer) return String::emptyString; } - if (isAsciiUtf16Buffer(buffer)) - { - return toAsciiString(buffer); - } - #if defined(HX_SMART_STRINGS) auto i = int64_t{ 0 }; + bool isAscii = true; while (i < buffer.length) { auto p = codepoint(buffer.slice(i)); + if (p > 127) + { + isAscii = false; + } + i += getByteCount(p); } + if (isAscii) + { + return toAsciiString(buffer); + } + auto chars = i / sizeof(char16_t); auto backing = View(::String::allocChar16Ptr(chars), chars); auto output = backing.reinterpret(); From 6271fdaeb184b78d8f665ce0be19f087a02f36e9 Mon Sep 17 00:00:00 2001 From: Tobiasz Laskowski Date: Thu, 5 Feb 2026 18:54:23 +0000 Subject: [PATCH 3/6] Add utf8 encode function that allocates output The existing Utf8::encode function takes in a buffer, but we don't know what size is required so we have to iterate through the string before writing to make sure the buffer is big enough. If the caller already ran getByteCount, then this means we have duplicated their work just for the case where they did not do it properly. 
This new method allocates its own buffer that is definitely the right size, which avoids the need for unnecessary checks --- include/cpp/encoding/Utf8.hpp | 1 + src/cpp/encoding/Encodings.cpp | 47 ++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/include/cpp/encoding/Utf8.hpp b/include/cpp/encoding/Utf8.hpp index 9636b8b51..8eeebead5 100644 --- a/include/cpp/encoding/Utf8.hpp +++ b/include/cpp/encoding/Utf8.hpp @@ -17,6 +17,7 @@ namespace cpp static int encode(const null&, const cpp::marshal::View& buffer); static int encode(const char32_t& codepoint, const cpp::marshal::View& buffer); static int64_t encode(const String& string, const cpp::marshal::View& buffer); + static Array encode(const String& string); static char32_t codepoint(const cpp::marshal::View& buffer); static String decode(const cpp::marshal::View& buffer); diff --git a/src/cpp/encoding/Encodings.cpp b/src/cpp/encoding/Encodings.cpp index 98c206e97..59228194f 100644 --- a/src/cpp/encoding/Encodings.cpp +++ b/src/cpp/encoding/Encodings.cpp @@ -224,6 +224,53 @@ int64_t cpp::encoding::Utf8::encode(const String& string, const cpp::marshal::Vi #endif } +Array cpp::encoding::Utf8::encode(const String& string) +{ + if (null() == string) + { + hx::NullReference("String", false); + } + + if (0 == string.length) + { + return 0; + } + + if (string.isAsciiEncoded()) + { + Array out(string.length, 0); + + View src(reinterpret_cast(const_cast(string.raw_ptr())), string.length); + View buffer(out->Pointer(), out->length); + + src.copyTo(buffer); + + return out; + } + +#if defined(HX_SMART_STRINGS) + Array out(getByteCount(string), 0); + View buffer(out->Pointer(), out->length); + + auto initialPtr = buffer.ptr.ptr; + auto source = View(string.raw_wptr(), string.length).reinterpret(); + auto i = int64_t{ 0 }; + auto k = int64_t{ 0 }; + + while (i < source.length) + { + auto p = Utf16::codepoint(source.slice(i)); + + i += Utf16::getByteCount(p); + k += encode(p, buffer.slice(k)); + } + 
+ return out; +#else + return hx::Throw(HX_CSTRING("Unexpected encoding error")); +#endif +} + int cpp::encoding::Utf8::encode(const char32_t& codepoint, const cpp::marshal::View& buffer) { if (codepoint <= 0x7F) From c696b23c56079b6f38cc05c4ec29e6a603836fb2 Mon Sep 17 00:00:00 2001 From: Tobiasz Laskowski Date: Thu, 5 Feb 2026 21:01:47 +0000 Subject: [PATCH 4/6] Pass char32_t by value instead of by reference --- include/cpp/encoding/Utf16.hpp | 6 +++--- include/cpp/encoding/Utf8.hpp | 6 +++--- src/cpp/encoding/Encodings.cpp | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/cpp/encoding/Utf16.hpp b/include/cpp/encoding/Utf16.hpp index ae2aacf31..6d851e14f 100644 --- a/include/cpp/encoding/Utf16.hpp +++ b/include/cpp/encoding/Utf16.hpp @@ -9,15 +9,15 @@ namespace cpp static bool isEncoded(const String& string); static int getByteCount(const null&); - static int getByteCount(const char32_t& codepoint); + static int getByteCount(char32_t codepoint); static int64_t getByteCount(const String& string); static int getCharCount(const null&); - static int getCharCount(const char32_t& codepoint); + static int getCharCount(char32_t codepoint); static int64_t getCharCount(const String& string); static int encode(const null&, const cpp::marshal::View& buffer); - static int encode(const char32_t& codepoint, const cpp::marshal::View& buffer); + static int encode(char32_t codepoint, const cpp::marshal::View& buffer); static int64_t encode(const String& string, const cpp::marshal::View& buffer); static char32_t codepoint(const cpp::marshal::View& buffer); diff --git a/include/cpp/encoding/Utf8.hpp b/include/cpp/encoding/Utf8.hpp index 8eeebead5..3ff12ba4b 100644 --- a/include/cpp/encoding/Utf8.hpp +++ b/include/cpp/encoding/Utf8.hpp @@ -7,15 +7,15 @@ namespace cpp struct Utf8 final { static int getByteCount(const null&); - static int getByteCount(const char32_t& codepoint); + static int getByteCount(char32_t codepoint); static int64_t 
getByteCount(const String& string); static int getCharCount(const null&); - static int getCharCount(const char32_t& codepoint); + static int getCharCount(char32_t codepoint); static int64_t getCharCount(const String& string); static int encode(const null&, const cpp::marshal::View& buffer); - static int encode(const char32_t& codepoint, const cpp::marshal::View& buffer); + static int encode(char32_t codepoint, const cpp::marshal::View& buffer); static int64_t encode(const String& string, const cpp::marshal::View& buffer); static Array encode(const String& string); diff --git a/src/cpp/encoding/Encodings.cpp b/src/cpp/encoding/Encodings.cpp index 59228194f..472e77094 100644 --- a/src/cpp/encoding/Encodings.cpp +++ b/src/cpp/encoding/Encodings.cpp @@ -93,7 +93,7 @@ int cpp::encoding::Utf8::getByteCount(const null&) return 0; } -int cpp::encoding::Utf8::getByteCount(const char32_t& codepoint) +int cpp::encoding::Utf8::getByteCount(char32_t codepoint) { if (codepoint <= 0x7F) { @@ -152,7 +152,7 @@ int cpp::encoding::Utf8::getCharCount(const null&) return 0; } -int cpp::encoding::Utf8::getCharCount(const char32_t& codepoint) +int cpp::encoding::Utf8::getCharCount(char32_t codepoint) { return getByteCount(codepoint) / sizeof(char); } @@ -271,7 +271,7 @@ Array cpp::encoding::Utf8::encode(const String& string) #endif } -int cpp::encoding::Utf8::encode(const char32_t& codepoint, const cpp::marshal::View& buffer) +int cpp::encoding::Utf8::encode(char32_t codepoint, const cpp::marshal::View& buffer) { if (codepoint <= 0x7F) { @@ -494,7 +494,7 @@ int cpp::encoding::Utf16::getByteCount(const null&) return 0; } -int cpp::encoding::Utf16::getByteCount(const char32_t& codepoint) +int cpp::encoding::Utf16::getByteCount(char32_t codepoint) { return codepoint <= 0xFFFF ? 
2 : 4; } @@ -528,7 +528,7 @@ int cpp::encoding::Utf16::getCharCount(const null&) return 0; } -int cpp::encoding::Utf16::getCharCount(const char32_t& codepoint) +int cpp::encoding::Utf16::getCharCount(char32_t codepoint) { return getByteCount(codepoint) / sizeof(char16_t); } @@ -599,7 +599,7 @@ int64_t cpp::encoding::Utf16::encode(const String& string, const cpp::marshal::V } } -int cpp::encoding::Utf16::encode(const char32_t& codepoint, const cpp::marshal::View& buffer) +int cpp::encoding::Utf16::encode(char32_t codepoint, const cpp::marshal::View& buffer) { if (codepoint < 0xD800) { From ecc950fd49bad10cb76f1272e3850eca57e8822a Mon Sep 17 00:00:00 2001 From: Tobiasz Laskowski Date: Thu, 5 Feb 2026 23:03:40 +0000 Subject: [PATCH 5/6] Estimate utf8 length instead of iterating --- src/cpp/encoding/Encodings.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/cpp/encoding/Encodings.cpp b/src/cpp/encoding/Encodings.cpp index 472e77094..5abd4c84d 100644 --- a/src/cpp/encoding/Encodings.cpp +++ b/src/cpp/encoding/Encodings.cpp @@ -249,7 +249,8 @@ Array cpp::encoding::Utf8::encode(const String& string) } #if defined(HX_SMART_STRINGS) - Array out(getByteCount(string), 0); + // upper bound: a UTF-16 code unit encodes to at most 3 UTF-8 bytes (surrogate pairs: 2 units -> 4 bytes) + Array out(string.length * 3, 0); View buffer(out->Pointer(), out->length); auto initialPtr = buffer.ptr.ptr; @@ -265,6 +266,8 @@ Array cpp::encoding::Utf8::encode(const String& string) k += encode(p, buffer.slice(k)); } + out->resize(k); + return out; #else return hx::Throw(HX_CSTRING("Unexpected encoding error")); From df2734c9762637dbffab6cae4f49635499301b53 Mon Sep 17 00:00:00 2001 From: Tobiasz Laskowski Date: Thu, 5 Feb 2026 23:18:53 +0000 Subject: [PATCH 6/6] Make Utf16::codepoint locally inline --- src/cpp/encoding/Encodings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/encoding/Encodings.cpp b/src/cpp/encoding/Encodings.cpp index 5abd4c84d..23c83c38c 100644 --- 
a/src/cpp/encoding/Encodings.cpp +++ b/src/cpp/encoding/Encodings.cpp @@ -687,7 +687,7 @@ String cpp::encoding::Utf16::decode(const cpp::marshal::View& buffer) #endif } -char32_t cpp::encoding::Utf16::codepoint(const cpp::marshal::View& buffer) +inline char32_t cpp::encoding::Utf16::codepoint(const cpp::marshal::View& buffer) { auto first = static_cast(Marshal::readUInt16(buffer));