From 9d2a1ef8a548acf9a3b29ba5a2c0943823ae7b13 Mon Sep 17 00:00:00 2001
From: Korsakov Vladislav
Date: Fri, 25 Jul 2025 19:49:16 +0300
Subject: [PATCH 1/3] Add IsSpace and rewrite IsNameStartChar + tests

---
 tinyxml2.h  | 26 +++++++++++++++++---------
 xmltest.cpp | 20 ++++++++++++++++++++
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/tinyxml2.h b/tinyxml2.h
index c8011174..914da216 100644
--- a/tinyxml2.h
+++ b/tinyxml2.h
@@ -561,22 +561,30 @@ class TINYXML2_LIB XMLUtil
     static char* SkipWhiteSpace( char* const p, int* curLineNumPtr ) {
         return const_cast<char*>( SkipWhiteSpace( const_cast<const char*>(p), curLineNumPtr ) );
     }
+    inline static bool IsSpace( unsigned char ch ) {
+        static constexpr uint64_t mask =
+            1ULL << 9
+            | 1ULL << 10
+            | 1ULL << 11
+            | 1ULL << 12
+            | 1ULL << 13
+            | 1ULL << 32;
+        if ( ch > 32 ) {
+            return false;
+        }
+        return ( mask >> ( ch & 63 ) ) & 1;
+    }
 
     // Anything in the high order range of UTF-8 is assumed to not be whitespace. This isn't
     // correct, but simple, and usually works.
     static bool IsWhiteSpace( char p ) {
-        return !IsUTF8Continuation(p) && isspace( static_cast<unsigned char>(p) );
+        return !IsUTF8Continuation(p) && IsSpace( static_cast<unsigned char>(p) );
     }
 
+    // The method checks the char for matching ':', '_', alphabetic symbols and char >= 128 by bit mask
     inline static bool IsNameStartChar( unsigned char ch ) {
-        if ( ch >= 128 ) {
-            // This is a heuristic guess in attempt to not implement Unicode-aware isalpha()
-            return true;
-        }
-        if ( isalpha( ch ) ) {
-            return true;
-        }
-        return ch == ':' || ch == '_';
+        static constexpr uint64_t mask[4] = { 1ULL << 58 , 1ULL << 31 | 0x07FFFFFE07FFFFFE , ~0ULL, ~0ULL};
+        return ( mask[ch >> 6] >> ( ch & 63 ) ) & 1;
     }
 
     inline static bool IsNameChar( unsigned char ch ) {
diff --git a/xmltest.cpp b/xmltest.cpp
index 75d4babf..56aa0560 100644
--- a/xmltest.cpp
+++ b/xmltest.cpp
@@ -2729,6 +2729,26 @@ int main( int argc, const char ** argv )
             }
         }
     }
+
+    // ---------- Testing IsNameStartChar ----------
+    {
+
+        XMLUtil test;
+        XMLTest("IsNameStartChar(':')", true, test.IsNameStartChar(':'));
+        XMLTest("IsNameStartChar('_')", true, test.IsNameStartChar('_'));
+        XMLTest("IsNameStartChar('@')", false, test.IsNameStartChar('@'));
+        XMLTest("IsNameStartChar('A')", true, test.IsNameStartChar('A'));
+        XMLTest("IsNameStartChar('Z')", true, test.IsNameStartChar('Z'));
+        XMLTest("IsNameStartChar('[')", false, test.IsNameStartChar('['));
+        XMLTest("IsNameStartChar('`')", false, test.IsNameStartChar('`'));
+        XMLTest("IsNameStartChar('a')", true, test.IsNameStartChar('a'));
+        XMLTest("IsNameStartChar('z')", true, test.IsNameStartChar('z'));
+        XMLTest("IsNameStartChar('{')", false, test.IsNameStartChar('{'));
+        XMLTest("IsNameStartChar(127)", false, test.IsNameStartChar(static_cast<unsigned char>(127)));
+        XMLTest("IsNameStartChar(128)", true, test.IsNameStartChar(static_cast<unsigned char>(128)));
+        XMLTest("IsNameStartChar(255)", true, test.IsNameStartChar(static_cast<unsigned char>(255)));
+    }
+
 
     // ----------- Performance tracking --------------
     {

From 0210bb7e3b045eaea5f9891a86073715642d06da Mon Sep 17 00:00:00 2001
From: Korsakov Vladislav
Date: Sat, 26 Jul 2025 10:25:39 +0300
Subject: [PATCH 2/3] Add comments for test

---
 xmltest.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xmltest.cpp b/xmltest.cpp
index 56aa0560..41a06a24 100644
--- a/xmltest.cpp
+++ b/xmltest.cpp
@@ -2732,8 +2732,8 @@ int main( int argc, const char ** argv )
 
     // ---------- Testing IsNameStartChar ----------
     {
-
         XMLUtil test;
+        // Tests validate key edge cases for IsNameStartChar without exhaustive coverage
         XMLTest("IsNameStartChar(':')", true, test.IsNameStartChar(':'));
         XMLTest("IsNameStartChar('_')", true, test.IsNameStartChar('_'));
         XMLTest("IsNameStartChar('@')", false, test.IsNameStartChar('@'));

From 3b4e446e6050e9a670aaeacde06428ebe94cd7f7 Mon Sep 17 00:00:00 2001
From: Korsakov Vladislav
Date: Fri, 1 Aug 2025 15:10:38 +0300
Subject: [PATCH 3/3] Simplify IsNameStartChar, IsSpace & IsWhiteSpace a little

---
 tinyxml2.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tinyxml2.h b/tinyxml2.h
index 914da216..e70b97d6 100644
--- a/tinyxml2.h
+++ b/tinyxml2.h
@@ -572,16 +572,14 @@ class TINYXML2_LIB XMLUtil
         if ( ch > 32 ) {
             return false;
         }
-        return ( mask >> ( ch & 63 ) ) & 1;
+        return mask >> ch & 1;
     }
 
-    // Anything in the high order range of UTF-8 is assumed to not be whitespace. This isn't
-    // correct, but simple, and usually works.
     static bool IsWhiteSpace( char p ) {
-        return !IsUTF8Continuation(p) && IsSpace( static_cast<unsigned char>(p) );
+        return IsSpace( static_cast<unsigned char>(p) );
     }
 
-    // The method checks the char for matching ':', '_', alphabetic symbols and char >= 128 by bit mask
+    // The method checks a char for matching ':', '_', alphabetic symbols or char >= 128 by bit mask
     inline static bool IsNameStartChar( unsigned char ch ) {
         static constexpr uint64_t mask[4] = { 1ULL << 58 , 1ULL << 31 | 0x07FFFFFE07FFFFFE , ~0ULL, ~0ULL};
         return ( mask[ch >> 6] >> ( ch & 63 ) ) & 1;