diff --git a/config.cmake.in b/config.cmake.in index 576ed78de7..4fbda765ec 100644 --- a/config.cmake.in +++ b/config.cmake.in @@ -73,6 +73,7 @@ foreach(component ${SOURCEMETA_CORE_COMPONENTS}) elseif(component STREQUAL "email") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_ip.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_dns.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_unicode.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_email.cmake") elseif(component STREQUAL "uri") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_io.cmake") diff --git a/src/core/email/CMakeLists.txt b/src/core/email/CMakeLists.txt index 7330454c26..22828bcf36 100644 --- a/src/core/email/CMakeLists.txt +++ b/src/core/email/CMakeLists.txt @@ -1,5 +1,5 @@ sourcemeta_library(NAMESPACE sourcemeta PROJECT core NAME email - SOURCES email.cc) + SOURCES email.cc helpers.h) if(SOURCEMETA_CORE_INSTALL) sourcemeta_library_install(NAMESPACE sourcemeta PROJECT core NAME email) @@ -9,3 +9,5 @@ target_link_libraries(sourcemeta_core_email PRIVATE sourcemeta::core::dns) target_link_libraries(sourcemeta_core_email PRIVATE sourcemeta::core::ip) +target_link_libraries(sourcemeta_core_email + PRIVATE sourcemeta::core::unicode) diff --git a/src/core/email/email.cc b/src/core/email/email.cc index 12a15472f7..c3eb899cfa 100644 --- a/src/core/email/email.cc +++ b/src/core/email/email.cc @@ -1,110 +1,15 @@ #include #include -#include -namespace sourcemeta::core { - -// RFC 5321 §4.1.2: atext = ALPHA / DIGIT / "!" / "#" / "$" / "%" / -// "&" / "'" / "*" / "+" / "-" / "/" / "=" / "?" 
/ "^" / "_" / "`" / -// "{" / "|" / "}" / "~" -static constexpr auto is_atext(const char character) -> bool { - switch (character) { - case '!': - case '#': - case '$': - case '%': - case '&': - case '\'': - case '*': - case '+': - case '-': - case '/': - case '=': - case '?': - case '^': - case '_': - case '`': - case '{': - case '|': - case '}': - case '~': - return true; - default: - return (character >= 'A' && character <= 'Z') || - (character >= 'a' && character <= 'z') || - (character >= '0' && character <= '9'); - } -} - -// RFC 5321 §4.1.2: qtextSMTP = %d32-33 / %d35-91 / %d93-126 -static constexpr auto is_qtext_smtp(const unsigned char character) -> bool { - return (character >= 32 && character <= 33) || - (character >= 35 && character <= 91) || - (character >= 93 && character <= 126); -} - -// RFC 5321 §4.1.2: Let-dig = ALPHA / DIGIT -static constexpr auto is_let_dig(const char character) -> bool { - return (character >= 'A' && character <= 'Z') || - (character >= 'a' && character <= 'z') || - (character >= '0' && character <= '9'); -} - -// RFC 5321 §4.1.3: dcontent = %d33-90 / %d94-126 -static constexpr auto is_dcontent(const unsigned char character) -> bool { - return (character >= 33 && character <= 90) || - (character >= 94 && character <= 126); -} - -// RFC 5321 §4.1.2: Ldh-str = *( ALPHA / DIGIT / "-" ) Let-dig -// RFC 5321 §4.1.3: Standardized-tag = Ldh-str -static constexpr auto is_ldh_str(const std::string_view value) -> bool { - if (value.empty() || !is_let_dig(value.back())) { - return false; - } - for (std::string_view::size_type position{0}; position + 1 < value.size(); - position += 1) { - const auto character{value[position]}; - if (!is_let_dig(character) && character != '-') { - return false; - } - } - return true; -} - -// RFC 5234 §2.3: ABNF literal strings are case-insensitive by default -// RFC 5321 §4.1.3: IPv6-address-literal prefix is the literal "IPv6:" -static constexpr auto matches_ipv6_tag(const std::string_view value) -> bool 
{ - return value.size() >= 5 && (value[0] == 'I' || value[0] == 'i') && - (value[1] == 'P' || value[1] == 'p') && - (value[2] == 'v' || value[2] == 'V') && value[3] == '6' && - value[4] == ':'; -} +#include "helpers.h" -// RFC 5321 §4.1.3: General-address-literal = Standardized-tag ":" 1*dcontent -static constexpr auto is_general_address_literal(const std::string_view value) - -> bool { - const auto colon_position{value.find(':')}; - if (colon_position == std::string_view::npos) { - return false; - } - if (!is_ldh_str(value.substr(0, colon_position))) { - return false; - } - const auto content{value.substr(colon_position + 1)}; - if (content.empty()) { - return false; - } - for (const auto character : content) { - if (!is_dcontent(static_cast(character))) { - return false; - } - } - return true; -} +namespace sourcemeta::core { -auto is_email(const std::string_view value) -> bool { +// RFC 5321 §4.1.2 Mailbox grammar. When AllowUtf8 is true, RFC 6531 §3.3 +// extends atext and qtextSMTP with UTF8-non-ascii, and sub-domain with U-label +template +static auto is_mailbox(const std::string_view value) -> bool { if (value.empty()) { return false; } @@ -126,11 +31,23 @@ auto is_email(const std::string_view value) -> bool { return false; } position += 1; - } else { - if (!is_qtext_smtp(static_cast(value[position]))) { + continue; + } + + if (is_qtext_smtp(static_cast(value[position]))) { + position += 1; + continue; + } + + if constexpr (AllowUtf8) { + // RFC 6531 §3.3: qtextSMTP =/ UTF8-non-ascii + const auto utf8_length{utf8_codepoint_length(value, position)}; + if (utf8_length < 2) { return false; } - position += 1; + position += utf8_length; + } else { + return false; } } if (position >= value.size()) { @@ -150,13 +67,29 @@ auto is_email(const std::string_view value) -> bool { } previous_was_dot = true; atom_started = false; - } else if (is_atext(character)) { + position += 1; + continue; + } + + if (is_atext(character)) { previous_was_dot = false; atom_started = true; + 
position += 1; + continue; + } + + if constexpr (AllowUtf8) { + // RFC 6531 §3.3: atext =/ UTF8-non-ascii + const auto utf8_length{utf8_codepoint_length(value, position)}; + if (utf8_length < 2) { + return false; + } + previous_was_dot = false; + atom_started = true; + position += utf8_length; } else { return false; } - position += 1; } if (position == 0 || previous_was_dot) { return false; @@ -177,32 +110,26 @@ auto is_email(const std::string_view value) -> bool { // RFC 5321 §4.1.3: address-literal = "[" ( IPv4 / IPv6 / General ) "]" if (!domain.empty() && domain.front() == '[') { - if (domain.back() != ']') { - return false; - } - // RFC 5321 §4.5.3.1.2: 255-octet cap on a domain "name or number" - if (domain.size() > 255) { - return false; - } - const auto inner{domain.substr(1, domain.size() - 2)}; - // RFC 5321 §4.1.3: IPv6-address-literal = "IPv6:" IPv6-addr - if (matches_ipv6_tag(inner) && is_ipv6(inner.substr(5))) { - return true; - } - // RFC 5234 §3.2: ABNF alternatives are unordered. A failed IPv6 match - // falls through to IPv4 or General-address-literal. - // RFC 5321 §4.1.3: IPv4-address-literal = Snum 3("." 
Snum) has no ":", - // General-address-literal requires ":" - if (inner.find(':') == std::string_view::npos) { - return is_ipv4(inner); - } - return is_general_address_literal(inner); + return is_address_literal(domain); } - // RFC 5321 §4.1.2 Domain matches is_hostname (RFC 1123 §2.1) by - // grammar, by 63-octet label cap (RFC 1035 §2.3.4), and by - // 255-octet total cap (RFC 5321 §4.5.3.1.2) - return is_hostname(domain); + if constexpr (AllowUtf8) { + // RFC 6531 §3.3: sub-domain =/ U-label + return is_idn_domain(domain); + } else { + // RFC 5321 §4.1.2 Domain matches is_hostname (RFC 1123 §2.1) by + // grammar, by 63-octet label cap (RFC 1035 §2.3.4), and by + // 255-octet total cap (RFC 5321 §4.5.3.1.2) + return is_hostname(domain); + } +} + +auto is_email(const std::string_view value) -> bool { + return is_mailbox(value); +} + +auto is_idn_email(const std::string_view value) -> bool { + return is_mailbox(value); } } // namespace sourcemeta::core diff --git a/src/core/email/helpers.h b/src/core/email/helpers.h new file mode 100644 index 0000000000..5cc9b822bf --- /dev/null +++ b/src/core/email/helpers.h @@ -0,0 +1,200 @@ +#ifndef SOURCEMETA_CORE_EMAIL_HELPERS_H_ +#define SOURCEMETA_CORE_EMAIL_HELPERS_H_ + +#include +#include + +#include // std::string_view + +namespace { + +// RFC 5321 §4.1.2: atext = ALPHA / DIGIT / "!" / "#" / "$" / "%" / +// "&" / "'" / "*" / "+" / "-" / "/" / "=" / "?" 
/ "^" / "_" / "`" / +// "{" / "|" / "}" / "~" +inline constexpr auto is_atext(const char character) -> bool { + switch (character) { + case '!': + case '#': + case '$': + case '%': + case '&': + case '\'': + case '*': + case '+': + case '-': + case '/': + case '=': + case '?': + case '^': + case '_': + case '`': + case '{': + case '|': + case '}': + case '~': + return true; + default: + return (character >= 'A' && character <= 'Z') || + (character >= 'a' && character <= 'z') || + (character >= '0' && character <= '9'); + } +} + +// RFC 5321 §4.1.2: qtextSMTP = %d32-33 / %d35-91 / %d93-126 +inline constexpr auto is_qtext_smtp(const unsigned char character) -> bool { + return (character >= 32 && character <= 33) || + (character >= 35 && character <= 91) || + (character >= 93 && character <= 126); +} + +// RFC 5321 §4.1.2: Let-dig = ALPHA / DIGIT +inline constexpr auto is_let_dig(const char character) -> bool { + return (character >= 'A' && character <= 'Z') || + (character >= 'a' && character <= 'z') || + (character >= '0' && character <= '9'); +} + +// RFC 5321 §4.1.3: dcontent = %d33-90 / %d94-126 +inline constexpr auto is_dcontent(const unsigned char character) -> bool { + return (character >= 33 && character <= 90) || + (character >= 94 && character <= 126); +} + +// RFC 5321 §4.1.2: Ldh-str = *( ALPHA / DIGIT / "-" ) Let-dig +// RFC 5321 §4.1.3: Standardized-tag = Ldh-str +inline constexpr auto is_ldh_str(const std::string_view value) -> bool { + if (value.empty() || !is_let_dig(value.back())) { + return false; + } + for (std::string_view::size_type position{0}; position + 1 < value.size(); + position += 1) { + const auto character{value[position]}; + if (!is_let_dig(character) && character != '-') { + return false; + } + } + return true; +} + +// RFC 5234 §2.3: ABNF literal strings are case-insensitive by default +// RFC 5321 §4.1.3: IPv6-address-literal prefix is the literal "IPv6:" +inline constexpr auto matches_ipv6_tag(const std::string_view value) -> bool 
{ + return value.size() >= 5 && (value[0] == 'I' || value[0] == 'i') && + (value[1] == 'P' || value[1] == 'p') && + (value[2] == 'v' || value[2] == 'V') && value[3] == '6' && + value[4] == ':'; +} + +// RFC 5321 §4.1.3: General-address-literal = Standardized-tag ":" 1*dcontent +inline constexpr auto is_general_address_literal(const std::string_view value) + -> bool { + const auto colon_position{value.find(':')}; + if (colon_position == std::string_view::npos) { + return false; + } + if (!is_ldh_str(value.substr(0, colon_position))) { + return false; + } + const auto content{value.substr(colon_position + 1)}; + if (content.empty()) { + return false; + } + for (const auto character : content) { + if (!is_dcontent(static_cast(character))) { + return false; + } + } + return true; +} + +// RFC 5321 §4.1.3: validate a whole "[...]" address-literal (IPv6, IPv4, or +// General). ASCII only, no IDNA. Input must be non-empty and start with "[" +inline auto is_address_literal(const std::string_view domain) -> bool { + if (domain.back() != ']') { + return false; + } + // RFC 5321 §4.5.3.1.2: 255-octet cap on a domain "name or number" + if (domain.size() > 255) { + return false; + } + const auto inner{domain.substr(1, domain.size() - 2)}; + // RFC 5321 §4.1.3: IPv6-address-literal = "IPv6:" IPv6-addr + if (matches_ipv6_tag(inner) && sourcemeta::core::is_ipv6(inner.substr(5))) { + return true; + } + // RFC 5234 §3.2: ABNF alternatives are unordered. A failed IPv6 match + // falls through to IPv4 or General-address-literal. 
+ // RFC 5321 §4.1.3: IPv4-address-literal has no ":"; + // General-address-literal requires ":" + if (inner.find(':') == std::string_view::npos) { + return sourcemeta::core::is_ipv4(inner); + } + return is_general_address_literal(inner); +} + +// TODO: Move to src/core/dns + +// RFC 6531 §3.3: sub-domain =/ U-label +// Relaxed sub-domain grammar where each label is a non-empty sequence of +// LetDig / hyphen / UTF8-non-ascii bytes, with no leading or trailing hyphen. +// Caps: 255 octets total (RFC 5321 §4.5.3.1.2), 63 per label (RFC 1035 §2.3.4) +inline auto is_idn_domain(const std::string_view value) -> bool { + if (value.empty() || value.size() > 255) { + return false; + } + + std::string_view::size_type position{0}; + while (position < value.size()) { + const auto label_start{position}; + bool last_was_hyphen{false}; + bool label_has_content{false}; + + while (position < value.size() && value[position] != '.') { + const auto character{value[position]}; + if (character == '-') { + if (!label_has_content) { + return false; + } + last_was_hyphen = true; + position += 1; + label_has_content = true; + continue; + } + + if (is_let_dig(character)) { + last_was_hyphen = false; + position += 1; + label_has_content = true; + continue; + } + + const auto utf8_length{ + sourcemeta::core::utf8_codepoint_length(value, position)}; + if (utf8_length < 2) { + return false; + } + last_was_hyphen = false; + position += utf8_length; + label_has_content = true; + } + + const auto label_length{position - label_start}; + if (label_length == 0 || label_length > 63 || last_was_hyphen) { + return false; + } + + if (position < value.size()) { + position += 1; + if (position == value.size()) { + // RFC 5321 §4.1.2 Domain has no trailing dot + return false; + } + } + } + + return true; +} + +} // namespace + +#endif diff --git a/src/core/email/include/sourcemeta/core/email.h b/src/core/email/include/sourcemeta/core/email.h index d65b2847b1..947c1b939c 100644 --- 
a/src/core/email/include/sourcemeta/core/email.h +++ b/src/core/email/include/sourcemeta/core/email.h @@ -8,7 +8,7 @@ #include // std::string_view /// @defgroup email Email -/// @brief E-mail address validation per RFC 5321. +/// @brief E-mail address validation per RFC 5321 and RFC 6531. /// /// This functionality is included as follows: /// @@ -36,6 +36,26 @@ namespace sourcemeta::core { SOURCEMETA_CORE_EMAIL_EXPORT auto is_email(const std::string_view value) -> bool; +/// @ingroup email +/// Check whether the given string is a valid internationalized `Mailbox` +/// per RFC 6531 Section 3.3 (extended Mailbox address syntax). Beyond the +/// ASCII grammar accepted by `is_email`, the local-part atoms, quoted +/// content, and domain labels may also contain valid UTF-8 non-ASCII byte +/// sequences (RFC 6532 Section 3.1). For example: +/// +/// ```cpp +/// #include +/// +/// #include +/// +/// assert(sourcemeta::core::is_idn_email( +/// "\xec\x8b\xa4\xeb\xa1\x80@\xec\x8b\xa4\xeb\xa1\x80.\xed\x85\x8c\xec\x8a\xa4\xed\x8a\xb8")); +/// assert(sourcemeta::core::is_idn_email("joe.bloggs@example.com")); +/// assert(!sourcemeta::core::is_idn_email("2962")); +/// ``` +SOURCEMETA_CORE_EMAIL_EXPORT +auto is_idn_email(const std::string_view value) -> bool; + } // namespace sourcemeta::core #endif diff --git a/src/core/unicode/include/sourcemeta/core/unicode.h b/src/core/unicode/include/sourcemeta/core/unicode.h index c8845a08eb..687734fd67 100644 --- a/src/core/unicode/include/sourcemeta/core/unicode.h +++ b/src/core/unicode/include/sourcemeta/core/unicode.h @@ -5,6 +5,8 @@ #include #endif +#include // std::size_t +#include // std::uint8_t #include // std::istream #include // std::optional #include // std::ostream @@ -98,6 +100,196 @@ SOURCEMETA_CORE_UNICODE_EXPORT auto utf8_to_utf32(const std::string_view input) -> std::optional; +/// @ingroup unicode +/// Determine the byte length encoded by a UTF-8 lead byte. 
Returns 1 for an +/// ASCII byte (%x00-7F), 2 for a 2-byte lead (%xC2-DF), 3 for a 3-byte lead +/// (%xE0-EF), 4 for a 4-byte lead (%xF0-F4), or 0 for any other byte +/// (continuation byte, overlong %xC0/%xC1, or out-of-range %xF5-FF). +/// For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// assert(sourcemeta::core::utf8_lead_byte_size(0x41) == 1); +/// assert(sourcemeta::core::utf8_lead_byte_size(0xCE) == 2); +/// assert(sourcemeta::core::utf8_lead_byte_size(0xE4) == 3); +/// assert(sourcemeta::core::utf8_lead_byte_size(0xF0) == 4); +/// assert(sourcemeta::core::utf8_lead_byte_size(0x80) == 0); +/// ``` +inline constexpr auto utf8_lead_byte_size(const unsigned char byte) + -> std::uint8_t { + if (byte < 0x80) { + return 1; + } + if (byte >= 0xC2 && byte <= 0xDF) { + return 2; + } + if (byte >= 0xE0 && byte <= 0xEF) { + return 3; + } + if (byte >= 0xF0 && byte <= 0xF4) { + return 4; + } + return 0; +} + +/// @ingroup unicode +/// Check whether the given byte is a UTF-8 continuation byte (%x80-BF per +/// RFC 6532 Section 3.1). For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// assert(sourcemeta::core::is_utf8_continuation(0x80)); +/// assert(sourcemeta::core::is_utf8_continuation(0xBF)); +/// assert(!sourcemeta::core::is_utf8_continuation(0x7F)); +/// assert(!sourcemeta::core::is_utf8_continuation(0xC0)); +/// ``` +inline constexpr auto is_utf8_continuation(const unsigned char byte) -> bool { + return byte >= 0x80 && byte <= 0xBF; +} + +/// @ingroup unicode +/// Check whether the given codepoint is in the UTF-16 surrogate range +/// (U+D800 to U+DFFF), which is forbidden in scalar Unicode text. 
+/// For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// assert(sourcemeta::core::is_surrogate(0xD800)); +/// assert(sourcemeta::core::is_surrogate(0xDFFF)); +/// assert(!sourcemeta::core::is_surrogate(0xD7FF)); +/// assert(!sourcemeta::core::is_surrogate(0xE000)); +/// ``` +inline constexpr auto is_surrogate(const char32_t codepoint) -> bool { + return codepoint >= 0xD800 && codepoint <= 0xDFFF; +} + +/// @ingroup unicode +/// Check whether the given value is a valid Unicode codepoint: in the range +/// U+0000 to U+10FFFF, excluding the UTF-16 surrogate range (U+D800 to +/// U+DFFF). For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// assert(sourcemeta::core::is_valid_codepoint(0x0000)); +/// assert(sourcemeta::core::is_valid_codepoint(0x10FFFF)); +/// assert(!sourcemeta::core::is_valid_codepoint(0xD800)); +/// assert(!sourcemeta::core::is_valid_codepoint(0x110000)); +/// ``` +inline constexpr auto is_valid_codepoint(const char32_t codepoint) -> bool { + return codepoint <= 0x10FFFF && !is_surrogate(codepoint); +} + +/// @ingroup unicode +/// Determine the number of UTF-8 bytes that a codepoint encodes to per +/// RFC 3629: 1 byte for U+0000-U+007F, 2 bytes for U+0080-U+07FF, 3 bytes +/// for U+0800-U+FFFF, and 4 bytes for U+10000 and above. The caller is +/// responsible for ensuring the codepoint is in range. 
For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// assert(sourcemeta::core::utf8_codepoint_byte_count(0x0041) == 1); +/// assert(sourcemeta::core::utf8_codepoint_byte_count(0x00E9) == 2); +/// assert(sourcemeta::core::utf8_codepoint_byte_count(0x4E2D) == 3); +/// assert(sourcemeta::core::utf8_codepoint_byte_count(0x1F600) == 4); +/// ``` +inline constexpr auto utf8_codepoint_byte_count(const char32_t codepoint) + -> std::uint8_t { + if (codepoint < 0x80) { + return 1; + } + if (codepoint < 0x800) { + return 2; + } + if (codepoint < 0x10000) { + return 3; + } + return 4; +} + +/// @ingroup unicode +/// Determine the byte length of the valid UTF-8 codepoint starting at the +/// given position within the input. Returns 1 for an ASCII byte, 2/3/4 for a +/// valid multi-byte UTF-8 sequence (RFC 6532 Section 3.1, excluding overlong +/// encodings, surrogates, and code points above U+10FFFF), or 0 if position +/// is out of bounds or no complete valid sequence starts there. For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// assert(sourcemeta::core::utf8_codepoint_length("A", 0) == 1); +/// assert(sourcemeta::core::utf8_codepoint_length("\xce\xb1", 0) == 2); +/// assert(sourcemeta::core::utf8_codepoint_length("\xe4\xb8\xad", 0) == 3); +/// assert(sourcemeta::core::utf8_codepoint_length("\xf0\x9f\x98\x80", 0) == 4); +/// assert(sourcemeta::core::utf8_codepoint_length("\xed\xa0\x80", 0) == 0); +/// ``` +inline constexpr auto +utf8_codepoint_length(const std::string_view input, + const std::string_view::size_type position) + -> std::size_t { + if (position >= input.size()) { + return 0; + } + const auto byte_0{static_cast(input[position])}; + const auto size{utf8_lead_byte_size(byte_0)}; + if (size == 0 || position + size > input.size()) { + return 0; + } + if (size == 1) { + return 1; + } + + // The byte right after the lead has tighter sub-ranges for specific leads + // (RFC 6532 §3.1) that exclude overlong encodings, surrogates, and code + // 
points above U+10FFFF + const auto byte_1{static_cast(input[position + 1])}; + bool byte_1_ok{false}; + if (size == 2) { + byte_1_ok = is_utf8_continuation(byte_1); + } else if (size == 3) { + if (byte_0 == 0xE0) { + byte_1_ok = byte_1 >= 0xA0 && byte_1 <= 0xBF; + } else if (byte_0 == 0xED) { + byte_1_ok = byte_1 >= 0x80 && byte_1 <= 0x9F; + } else { + byte_1_ok = is_utf8_continuation(byte_1); + } + } else { + if (byte_0 == 0xF0) { + byte_1_ok = byte_1 >= 0x90 && byte_1 <= 0xBF; + } else if (byte_0 == 0xF4) { + byte_1_ok = byte_1 >= 0x80 && byte_1 <= 0x8F; + } else { + byte_1_ok = is_utf8_continuation(byte_1); + } + } + + if (!byte_1_ok) { + return 0; + } + + // Remaining continuation bytes (if any) are unconstrained beyond the + // continuation byte range + for (std::size_t index{2}; index < size; ++index) { + if (!is_utf8_continuation( + static_cast(input[position + index]))) { + return 0; + } + } + + return size; +} + } // namespace sourcemeta::core #endif diff --git a/src/core/unicode/unicode.cc b/src/core/unicode/unicode.cc index fa43ef1002..f85267c525 100644 --- a/src/core/unicode/unicode.cc +++ b/src/core/unicode/unicode.cc @@ -7,8 +7,7 @@ namespace sourcemeta::core { auto codepoint_to_utf8(const char32_t codepoint, std::ostream &output) -> void { - assert(codepoint <= 0x10FFFF); - assert(codepoint < 0xD800 || codepoint > 0xDFFF); + assert(is_valid_codepoint(codepoint)); if (codepoint < 0x80) { output.put(static_cast(codepoint)); } else if (codepoint < 0x800) { @@ -27,8 +26,7 @@ auto codepoint_to_utf8(const char32_t codepoint, std::ostream &output) -> void { } auto codepoint_to_utf8(const char32_t codepoint, std::string &output) -> void { - assert(codepoint <= 0x10FFFF); - assert(codepoint < 0xD800 || codepoint > 0xDFFF); + assert(is_valid_codepoint(codepoint)); if (codepoint < 0x80) { output.push_back(static_cast(codepoint)); } else if (codepoint < 0x800) { @@ -57,41 +55,39 @@ auto utf8_to_utf32(std::istream &input) -> std::optional { std::uint8_t byte{0}; 
while (input.read(reinterpret_cast(&byte), 1)) { - char32_t code_point{0}; - std::uint8_t continuation_count{0}; - char32_t minimum{0}; - - if (byte < 0x80) { + const auto size{utf8_lead_byte_size(byte)}; + if (size == 0) { + return std::nullopt; + } + if (size == 1) { result.push_back(byte); continue; - } else if ((byte & 0xE0) == 0xC0) { + } + + char32_t code_point{0}; + char32_t minimum{0}; + if (size == 2) { code_point = byte & 0x1F; - continuation_count = 1; minimum = 0x80; - } else if ((byte & 0xF0) == 0xE0) { + } else if (size == 3) { code_point = byte & 0x0F; - continuation_count = 2; minimum = 0x800; - } else if ((byte & 0xF8) == 0xF0) { + } else { code_point = byte & 0x07; - continuation_count = 3; minimum = 0x10000; - } else { - return std::nullopt; } - for (std::uint8_t index = 0; index < continuation_count; ++index) { + for (std::uint8_t index{1}; index < size; ++index) { std::uint8_t continuation{0}; if (!input.read(reinterpret_cast(&continuation), 1) || - (continuation & 0xC0) != 0x80) { + !is_utf8_continuation(continuation)) { return std::nullopt; } code_point = (code_point << 6) | (continuation & 0x3F); } - if (code_point < minimum || code_point > 0x10FFFF || - (code_point >= 0xD800 && code_point <= 0xDFFF)) { + if (code_point < minimum || !is_valid_codepoint(code_point)) { return std::nullopt; } diff --git a/test/email/CMakeLists.txt b/test/email/CMakeLists.txt index 52b5ebf0a8..b78bcac2d8 100644 --- a/test/email/CMakeLists.txt +++ b/test/email/CMakeLists.txt @@ -1,5 +1,5 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME email - SOURCES email_test.cc) + SOURCES email_test.cc idn_email_test.cc) target_link_libraries(sourcemeta_core_email_unit PRIVATE sourcemeta::core::email) diff --git a/test/email/email_test.cc b/test/email/email_test.cc index 161e9593b9..54c6b6fb87 100644 --- a/test/email/email_test.cc +++ b/test/email/email_test.cc @@ -7,321 +7,385 @@ // RFC 5321 §4.1.2: minimal Dot-string Atom + minimal Domain sub-domain 
TEST(Email, valid_dot_string_single_letter) { EXPECT_TRUE(sourcemeta::core::is_email("a@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@b")); } // RFC 5321 §4.1.2: atext includes DIGIT TEST(Email, valid_dot_string_single_digit) { EXPECT_TRUE(sourcemeta::core::is_email("1@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("1@b")); } // RFC 5321 §4.1.2: Atom = 1*atext TEST(Email, valid_dot_string_multi_letter_atom) { EXPECT_TRUE(sourcemeta::core::is_email("abc@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("abc@b")); } // RFC 5321 §4.1.2: atext mixes ALPHA and DIGIT TEST(Email, valid_dot_string_alpha_digit_mix) { EXPECT_TRUE(sourcemeta::core::is_email("a1b2@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a1b2@b")); } // RFC 5321 §4.1.2: Dot-string = Atom *("." Atom) with two atoms TEST(Email, valid_dot_string_two_atoms) { EXPECT_TRUE(sourcemeta::core::is_email("a.b@c")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a.b@c")); } // RFC 5321 §4.1.2: Dot-string with many atoms TEST(Email, valid_dot_string_many_atoms) { EXPECT_TRUE(sourcemeta::core::is_email("a.b.c.d.e@f")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a.b.c.d.e@f")); } // RFC 5321 §4.1.2: ALPHA covers A-Z TEST(Email, valid_dot_string_uppercase_atom) { EXPECT_TRUE(sourcemeta::core::is_email("ABC@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("ABC@b")); } // RFC 5321 §4.1.2: ALPHA covers both cases in one atom TEST(Email, valid_dot_string_mixed_case_atom) { EXPECT_TRUE(sourcemeta::core::is_email("aBc@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("aBc@b")); } // RFC 5321 §4.1.2: "!" 
is atext TEST(Email, valid_atext_bang) { EXPECT_TRUE(sourcemeta::core::is_email("!@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("!@b")); } // RFC 5321 §4.1.2: "#" is atext TEST(Email, valid_atext_hash) { EXPECT_TRUE(sourcemeta::core::is_email("#@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("#@b")); } // RFC 5321 §4.1.2: "$" is atext TEST(Email, valid_atext_dollar) { EXPECT_TRUE(sourcemeta::core::is_email("$@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("$@b")); } // RFC 5321 §4.1.2: "%" is atext TEST(Email, valid_atext_percent) { EXPECT_TRUE(sourcemeta::core::is_email("%@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("%@b")); } // RFC 5321 §4.1.2: "&" is atext TEST(Email, valid_atext_ampersand) { EXPECT_TRUE(sourcemeta::core::is_email("&@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("&@b")); } // RFC 5321 §4.1.2: "'" is atext TEST(Email, valid_atext_apostrophe) { EXPECT_TRUE(sourcemeta::core::is_email("'@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("'@b")); } // RFC 5321 §4.1.2: "*" is atext TEST(Email, valid_atext_asterisk) { EXPECT_TRUE(sourcemeta::core::is_email("*@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("*@b")); } // RFC 5321 §4.1.2: "+" is atext TEST(Email, valid_atext_plus) { EXPECT_TRUE(sourcemeta::core::is_email("+@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("+@b")); } // RFC 5321 §4.1.2: "-" is atext TEST(Email, valid_atext_hyphen) { EXPECT_TRUE(sourcemeta::core::is_email("-@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("-@b")); } // RFC 5321 §4.1.2: "/" is atext TEST(Email, valid_atext_slash) { EXPECT_TRUE(sourcemeta::core::is_email("/@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("/@b")); } // RFC 5321 §4.1.2: "=" is atext TEST(Email, valid_atext_equals) { EXPECT_TRUE(sourcemeta::core::is_email("=@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("=@b")); } // RFC 5321 §4.1.2: "?" 
is atext TEST(Email, valid_atext_question) { EXPECT_TRUE(sourcemeta::core::is_email("?@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("?@b")); } // RFC 5321 §4.1.2: "^" is atext TEST(Email, valid_atext_caret) { EXPECT_TRUE(sourcemeta::core::is_email("^@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("^@b")); } // RFC 5321 §4.1.2: "_" is atext TEST(Email, valid_atext_underscore) { EXPECT_TRUE(sourcemeta::core::is_email("_@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("_@b")); } // RFC 5321 §4.1.2: "`" is atext TEST(Email, valid_atext_backtick) { EXPECT_TRUE(sourcemeta::core::is_email("`@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("`@b")); } // RFC 5321 §4.1.2: "{" is atext TEST(Email, valid_atext_lbrace) { EXPECT_TRUE(sourcemeta::core::is_email("{@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("{@b")); } // RFC 5321 §4.1.2: "|" is atext TEST(Email, valid_atext_pipe) { EXPECT_TRUE(sourcemeta::core::is_email("|@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("|@b")); } // RFC 5321 §4.1.2: "}" is atext TEST(Email, valid_atext_rbrace) { EXPECT_TRUE(sourcemeta::core::is_email("}@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("}@b")); } // RFC 5321 §4.1.2: "~" is atext TEST(Email, valid_atext_tilde) { EXPECT_TRUE(sourcemeta::core::is_email("~@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("~@b")); } // RFC 5321 §4.1.2: a single Atom may include every atext special at once TEST(Email, valid_dot_string_all_specials_one_atom) { EXPECT_TRUE(sourcemeta::core::is_email("!#$%&'*+-/=?^_`{|}~@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("!#$%&'*+-/=?^_`{|}~@b")); } // RFC 5321 §4.5.3.1.1: Local-part octet limit is 64 TEST(Email, valid_local_part_length_64) { EXPECT_TRUE(sourcemeta::core::is_email(std::string(64, 'a') + "@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email(std::string(64, 'a') + "@b")); } // RFC 5321 §4.1.2: Atom = 1*atext, single byte is the minimum TEST(Email, valid_local_part_length_1) { 
EXPECT_TRUE(sourcemeta::core::is_email("a@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@b")); } // RFC 5321 §4.1.2: Mailbox requires both a Local-part and a Domain TEST(Email, invalid_no_at_sign) { EXPECT_FALSE(sourcemeta::core::is_email("plain")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("plain")); } // RFC 5321 §4.1.2: Local-part = Dot-string / Quoted-string, both non-empty TEST(Email, invalid_only_at_sign) { EXPECT_FALSE(sourcemeta::core::is_email("@")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("@")); } // RFC 5321 §4.1.2: Atom = 1*atext, empty Local-part is invalid TEST(Email, invalid_empty_local) { EXPECT_FALSE(sourcemeta::core::is_email("@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("@b")); } // RFC 5321 §4.1.2: Domain = sub-domain *("." sub-domain), empty is invalid TEST(Email, invalid_empty_domain) { EXPECT_FALSE(sourcemeta::core::is_email("a@")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@")); } // RFC 5321 §4.1.2: only one "@" allowed outside a Quoted-string TEST(Email, invalid_two_at_signs_unquoted) { EXPECT_FALSE(sourcemeta::core::is_email("a@b@c")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@b@c")); } // RFC 5321 §4.1.2: three unquoted "@" signs TEST(Email, invalid_three_at_signs_unquoted) { EXPECT_FALSE(sourcemeta::core::is_email("a@b@c@d")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@b@c@d")); } // RFC 5321 §4.1.2: Dot-string requires a leading Atom TEST(Email, invalid_local_leading_dot) { EXPECT_FALSE(sourcemeta::core::is_email(".a@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email(".a@b")); } // RFC 5321 §4.1.2: Dot-string requires a trailing Atom TEST(Email, invalid_local_trailing_dot) { EXPECT_FALSE(sourcemeta::core::is_email("a.@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a.@b")); } // RFC 5321 §4.1.2: Atom = 1*atext, double dot yields an empty Atom TEST(Email, invalid_local_double_dot) { EXPECT_FALSE(sourcemeta::core::is_email("a..b@c")); + 
EXPECT_FALSE(sourcemeta::core::is_idn_email("a..b@c")); } // RFC 5321 §4.1.2: lone "." has no atext on either side TEST(Email, invalid_local_only_dot) { EXPECT_FALSE(sourcemeta::core::is_email(".@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email(".@b")); } // RFC 5321 §4.1.2: ".." has no atext between the dots TEST(Email, invalid_local_only_dots) { EXPECT_FALSE(sourcemeta::core::is_email("..@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("..@b")); } // RFC 5321 §4.5.3.1.1: Local-part may not exceed 64 octets TEST(Email, invalid_local_part_length_65) { EXPECT_FALSE(sourcemeta::core::is_email(std::string(65, 'a') + "@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email(std::string(65, 'a') + "@b")); } // RFC 5321 §4.1.2: "(" is not in atext TEST(Email, invalid_atext_lparen) { EXPECT_FALSE(sourcemeta::core::is_email("(@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("(@b")); } // RFC 5321 §4.1.2: ")" is not in atext TEST(Email, invalid_atext_rparen) { EXPECT_FALSE(sourcemeta::core::is_email(")@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email(")@b")); } // RFC 5321 §4.1.2: "," is not in atext TEST(Email, invalid_atext_comma) { EXPECT_FALSE(sourcemeta::core::is_email(",@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email(",@b")); } // RFC 5321 §4.1.2: ":" is not in atext TEST(Email, invalid_atext_colon) { EXPECT_FALSE(sourcemeta::core::is_email(":@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email(":@b")); } // RFC 5321 §4.1.2: ";" is not in atext TEST(Email, invalid_atext_semicolon) { EXPECT_FALSE(sourcemeta::core::is_email(";@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email(";@b")); } // RFC 5321 §4.1.2: "<" is not in atext TEST(Email, invalid_atext_lt) { EXPECT_FALSE(sourcemeta::core::is_email("<@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("<@b")); } // RFC 5321 §4.1.2: ">" is not in atext TEST(Email, invalid_atext_gt) { EXPECT_FALSE(sourcemeta::core::is_email(">@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email(">@b")); } // RFC 5321 
§4.1.2: "[" is not in atext TEST(Email, invalid_atext_lbracket) { EXPECT_FALSE(sourcemeta::core::is_email("[@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("[@b")); } // RFC 5321 §4.1.2: "\" is not in atext (only valid inside quoted-pair) TEST(Email, invalid_atext_backslash) { EXPECT_FALSE(sourcemeta::core::is_email("\\@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("\\@b")); } // RFC 5321 §4.1.2: "]" is not in atext TEST(Email, invalid_atext_rbracket) { EXPECT_FALSE(sourcemeta::core::is_email("]@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("]@b")); } // RFC 5321 §4.1.2: SP is not in atext TEST(Email, invalid_atext_space_unquoted) { EXPECT_FALSE(sourcemeta::core::is_email(" @b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email(" @b")); } // RFC 5321 §4.1.2: atext is ASCII; bytes >= 0x80 are excluded TEST(Email, invalid_local_high_bit_byte) { EXPECT_FALSE(sourcemeta::core::is_email("a\x80@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a\x80@b")); } // RFC 5321 §4.1.2: NUL is not in atext TEST(Email, invalid_local_nul_byte) { EXPECT_FALSE(sourcemeta::core::is_email(std::string_view{"a\x00@b", 4})); + EXPECT_FALSE(sourcemeta::core::is_idn_email(std::string_view{"a\x00@b", 4})); } // RFC 5321 §4.1.2: control characters are not in atext TEST(Email, invalid_local_control_byte) { EXPECT_FALSE(sourcemeta::core::is_email("a\x01@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a\x01@b")); } // RFC 5321 §4.1.2: DEL is not in atext TEST(Email, invalid_local_del_byte) { EXPECT_FALSE(sourcemeta::core::is_email("a\x7f@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a\x7f@b")); } // RFC 5321 §4.1.2 Domain: sub-domain *("." sub-domain) with one label TEST(Email, valid_domain_single_label) { EXPECT_TRUE(sourcemeta::core::is_email("a@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@b")); } // RFC 5321 §4.1.2 Domain: two labels separated by "." 
TEST(Email, valid_domain_two_labels) { EXPECT_TRUE(sourcemeta::core::is_email("a@b.c")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@b.c")); } // RFC 5321 §4.1.2 Domain: many labels TEST(Email, valid_domain_three_labels) { EXPECT_TRUE(sourcemeta::core::is_email("a@b.c.d")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@b.c.d")); } // RFC 5321 §4.1.2 Domain: sub-domain = Let-dig [Ldh-str], digit is Let-dig TEST(Email, valid_domain_label_starts_with_digit) { EXPECT_TRUE(sourcemeta::core::is_email("a@1b.c")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@1b.c")); } // RFC 5321 §4.1.2 Domain: grammar allows numeric-only labels TEST(Email, valid_domain_numeric_tld) { EXPECT_TRUE(sourcemeta::core::is_email("a@example.123")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@example.123")); } // RFC 5321 §4.1.2 Domain: case is preserved but accepted TEST(Email, valid_domain_uppercase) { EXPECT_TRUE(sourcemeta::core::is_email("a@EXAMPLE.COM")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@EXAMPLE.COM")); } // RFC 1035 §2.3.4 via §4.1.2 Domain: 63-byte label is the cap TEST(Email, valid_domain_label_63) { EXPECT_TRUE(sourcemeta::core::is_email("a@" + std::string(63, 'b'))); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@" + std::string(63, 'b'))); } // RFC 5321 §4.5.3.1.2 Domain: 255-byte domain is the cap @@ -329,41 +393,51 @@ TEST(Email, valid_domain_total_255) { EXPECT_TRUE(sourcemeta::core::is_email( "a@" + std::string(63, 'b') + "." + std::string(63, 'c') + "." + std::string(63, 'd') + "." + std::string(63, 'e'))); + EXPECT_TRUE(sourcemeta::core::is_idn_email( + "a@" + std::string(63, 'b') + "." + std::string(63, 'c') + "." + + std::string(63, 'd') + "." 
+ std::string(63, 'e'))); } // RFC 5321 §4.1.2 Domain: Ldh-str excludes "_" TEST(Email, invalid_domain_underscore) { EXPECT_FALSE(sourcemeta::core::is_email("a@host_name")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@host_name")); } // RFC 5321 §4.1.2 Domain: sub-domain must start with Let-dig TEST(Email, invalid_domain_leading_hyphen) { EXPECT_FALSE(sourcemeta::core::is_email("a@-host")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@-host")); } // RFC 5321 §4.1.2 Domain: Ldh-str must end with Let-dig TEST(Email, invalid_domain_trailing_hyphen) { EXPECT_FALSE(sourcemeta::core::is_email("a@host-")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@host-")); } // RFC 5321 §4.1.2 Domain: no trailing "." TEST(Email, invalid_domain_trailing_dot) { EXPECT_FALSE(sourcemeta::core::is_email("a@host.")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@host.")); } // RFC 5321 §4.1.2 Domain: "." between labels requires a sub-domain on each side TEST(Email, invalid_domain_double_dot) { EXPECT_FALSE(sourcemeta::core::is_email("a@x..y")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@x..y")); } // RFC 5321 §4.1.2 Domain: no leading "." TEST(Email, invalid_domain_leading_dot) { EXPECT_FALSE(sourcemeta::core::is_email("a@.host")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@.host")); } // RFC 1035 §2.3.4 via §4.1.2 Domain: 64-byte label exceeds cap TEST(Email, invalid_domain_label_64) { EXPECT_FALSE(sourcemeta::core::is_email("a@" + std::string(64, 'b'))); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@" + std::string(64, 'b'))); } // RFC 5321 §4.5.3.1.2 Domain: 256-byte domain exceeds cap (63 + "." + 63 + @@ -372,195 +446,237 @@ TEST(Email, invalid_domain_total_256) { EXPECT_FALSE(sourcemeta::core::is_email( "a@" + std::string(63, 'b') + "." + std::string(63, 'c') + "." + std::string(63, 'd') + "." + std::string(62, 'e') + ".f")); + EXPECT_FALSE(sourcemeta::core::is_idn_email( + "a@" + std::string(63, 'b') + "." + std::string(63, 'c') + "." 
+ + std::string(63, 'd') + "." + std::string(62, 'e') + ".f")); } // RFC 5321 §4.1.2 Domain: ASCII only, bytes >= 0x80 excluded TEST(Email, invalid_domain_high_bit_byte) { EXPECT_FALSE(sourcemeta::core::is_email("a@hos\x80t")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@hos\x80t")); } // RFC 5321 §4.1.2 Domain: SP is not in Ldh-str TEST(Email, invalid_domain_space) { EXPECT_FALSE(sourcemeta::core::is_email("a@host name")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@host name")); } // RFC 5321 §4.1.2: Quoted-string = DQUOTE *QcontentSMTP DQUOTE permits zero // content bytes TEST(Email, valid_quoted_empty) { EXPECT_TRUE(sourcemeta::core::is_email("\"\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"\"@b")); } // RFC 5321 §4.1.2: qtextSMTP includes %d32 SP TEST(Email, valid_quoted_single_space) { EXPECT_TRUE(sourcemeta::core::is_email("\" \"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\" \"@b")); } // RFC 5321 §4.1.2: any atext byte is also in qtextSMTP TEST(Email, valid_quoted_single_atext) { EXPECT_TRUE(sourcemeta::core::is_email("\"a\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"a\"@b")); } // RFC 5321 §4.1.2: qtextSMTP %d35-91 includes "@" TEST(Email, valid_quoted_at_inside) { EXPECT_TRUE(sourcemeta::core::is_email("\"a@b\"@c")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"a@b\"@c")); } // RFC 5321 §4.1.2: Dot-string rules do not apply inside Quoted-string TEST(Email, valid_quoted_dot_inside) { EXPECT_TRUE(sourcemeta::core::is_email("\"a.b\"@c")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"a.b\"@c")); } // RFC 5321 §4.1.2: Quoted-string permits a leading "." inside TEST(Email, valid_quoted_dot_at_start) { EXPECT_TRUE(sourcemeta::core::is_email("\".a\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\".a\"@b")); } // RFC 5321 §4.1.2: Quoted-string permits a trailing "." 
inside TEST(Email, valid_quoted_dot_at_end) { EXPECT_TRUE(sourcemeta::core::is_email("\"a.\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"a.\"@b")); } // RFC 5321 §4.1.2: Quoted-string permits consecutive "." inside TEST(Email, valid_quoted_double_dot_inside) { EXPECT_TRUE(sourcemeta::core::is_email("\"a..b\"@c")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"a..b\"@c")); } // RFC 5321 §4.1.2: qtextSMTP starts at %d32 (SP) TEST(Email, valid_quoted_qtext_d32_space) { EXPECT_TRUE(sourcemeta::core::is_email("\" \"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\" \"@b")); } // RFC 5321 §4.1.2: qtextSMTP includes %d33 "!" TEST(Email, valid_quoted_qtext_d33_bang) { EXPECT_TRUE(sourcemeta::core::is_email("\"!\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"!\"@b")); } // RFC 5321 §4.1.2: qtextSMTP resumes at %d35 "#" after skipping DQUOTE TEST(Email, valid_quoted_qtext_d35_hash) { EXPECT_TRUE(sourcemeta::core::is_email("\"#\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"#\"@b")); } // RFC 5321 §4.1.2: qtextSMTP %d35-91 ends at "[" (%d91) TEST(Email, valid_quoted_qtext_d91_lbracket) { EXPECT_TRUE(sourcemeta::core::is_email("\"[\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"[\"@b")); } // RFC 5321 §4.1.2: qtextSMTP resumes at "]" (%d93) after skipping "\\" TEST(Email, valid_quoted_qtext_d93_rbracket) { EXPECT_TRUE(sourcemeta::core::is_email("\"]\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"]\"@b")); } // RFC 5321 §4.1.2: qtextSMTP ends at "~" (%d126) TEST(Email, valid_quoted_qtext_d126_tilde) { EXPECT_TRUE(sourcemeta::core::is_email("\"~\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"~\"@b")); } // RFC 5321 §4.1.2: quoted-pairSMTP escapes DQUOTE TEST(Email, valid_quoted_pair_dquote) { EXPECT_TRUE(sourcemeta::core::is_email("\"\\\"\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"\\\"\"@b")); } // RFC 5321 §4.1.2: quoted-pairSMTP escapes backslash itself TEST(Email, valid_quoted_pair_backslash) { 
EXPECT_TRUE(sourcemeta::core::is_email("\"\\\\\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"\\\\\"@b")); } // RFC 5321 §4.1.2: quoted-pairSMTP body %d32 is SP TEST(Email, valid_quoted_pair_space) { EXPECT_TRUE(sourcemeta::core::is_email("\"\\ \"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"\\ \"@b")); } // RFC 5321 §4.1.2: quoted-pairSMTP body covers any ASCII graphic TEST(Email, valid_quoted_pair_letter) { EXPECT_TRUE(sourcemeta::core::is_email("\"\\a\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"\\a\"@b")); } // RFC 5321 §4.1.2: quoted-pairSMTP body %d126 is "~" TEST(Email, valid_quoted_pair_tilde) { EXPECT_TRUE(sourcemeta::core::is_email("\"\\~\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"\\~\"@b")); } // RFC 5321 §4.1.2: Quoted-string accepts qtextSMTP and quoted-pairSMTP mixed TEST(Email, valid_quoted_mixed_qtext_and_pair) { EXPECT_TRUE(sourcemeta::core::is_email("\"a\\\"b\\\\c\"@d")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"a\\\"b\\\\c\"@d")); } // RFC 5321 §4.1.2: qtextSMTP includes ',' and ';' (both in %d35-91) TEST(Email, valid_quoted_with_comma_semicolon) { EXPECT_TRUE(sourcemeta::core::is_email("\"a,b;c\"@d")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"a,b;c\"@d")); } // RFC 5321 §4.1.2: qtextSMTP includes '(' and ')' (both in %d35-91) TEST(Email, valid_quoted_with_parens) { EXPECT_TRUE(sourcemeta::core::is_email("\"(comment)\"@d")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"(comment)\"@d")); } // RFC 5321 §4.5.3.1.1: 62 qtext bytes plus two DQUOTEs equals 64 octets TEST(Email, valid_quoted_local_length_64) { EXPECT_TRUE(sourcemeta::core::is_email("\"" + std::string(62, 'a') + "\"@b")); + EXPECT_TRUE( + sourcemeta::core::is_idn_email("\"" + std::string(62, 'a') + "\"@b")); } // RFC 5321 §4.1.2: Quoted-string must terminate with DQUOTE TEST(Email, invalid_quoted_unterminated) { EXPECT_FALSE(sourcemeta::core::is_email("\"foo@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"foo@b")); } // 
RFC 5321 §4.1.2: extra content after closing DQUOTE is not Mailbox grammar TEST(Email, invalid_quoted_followed_by_atext) { EXPECT_FALSE(sourcemeta::core::is_email("\"a\"b@c")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"a\"b@c")); } // RFC 5321 §4.1.2: Mailbox grammar does not permit mixing Quoted-string and // Dot-string in a Local-part TEST(Email, invalid_quoted_followed_by_dot_atext) { EXPECT_FALSE(sourcemeta::core::is_email("\"a\".b@c")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"a\".b@c")); } // RFC 5321 §4.1.2: a Quoted-string cannot be preceded by atext TEST(Email, invalid_quoted_preceded_by_atext) { EXPECT_FALSE(sourcemeta::core::is_email("a\"b\"@c")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a\"b\"@c")); } // RFC 5321 §4.1.2: bare DQUOTE inside a Quoted-string closes it; the trailing // bytes break the Mailbox grammar TEST(Email, invalid_quoted_bare_dquote_inside) { EXPECT_FALSE(sourcemeta::core::is_email("\"a\"b\"@c")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"a\"b\"@c")); } // RFC 5321 §4.1.2: quoted-pairSMTP consumes the byte after "\\"; if that byte // is DQUOTE the Quoted-string is left unterminated TEST(Email, invalid_quoted_dangling_backslash) { EXPECT_FALSE(sourcemeta::core::is_email("\"a\\\"@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"a\\\"@b")); } // RFC 5321 §4.1.2: qtextSMTP excludes controls (%d0-31) TEST(Email, invalid_quoted_qtext_control_byte) { EXPECT_FALSE(sourcemeta::core::is_email("\"\x01\"@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"\x01\"@b")); } // RFC 5321 §4.1.2: qtextSMTP excludes NUL TEST(Email, invalid_quoted_qtext_nul) { EXPECT_FALSE(sourcemeta::core::is_email(std::string_view{"\"\x00\"@b", 5})); + EXPECT_FALSE( + sourcemeta::core::is_idn_email(std::string_view{"\"\x00\"@b", 5})); } // RFC 5321 §4.1.2: qtextSMTP is ASCII, bytes >= 0x80 are excluded TEST(Email, invalid_quoted_qtext_high_bit) { EXPECT_FALSE(sourcemeta::core::is_email("\"\x80\"@b")); + 
EXPECT_FALSE(sourcemeta::core::is_idn_email("\"\x80\"@b")); } // RFC 5321 §4.1.2: quoted-pairSMTP body is %d32-126, controls are excluded TEST(Email, invalid_quoted_pair_control_byte) { EXPECT_FALSE(sourcemeta::core::is_email("\"\\\x01\"@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"\\\x01\"@b")); } // RFC 5321 §4.1.2: quoted-pairSMTP body ends at %d126, DEL is excluded TEST(Email, invalid_quoted_pair_del_byte) { EXPECT_FALSE(sourcemeta::core::is_email("\"\\\x7f\"@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"\\\x7f\"@b")); } // RFC 5321 §4.1.2: quoted-pairSMTP body is ASCII, bytes >= 0x80 are excluded TEST(Email, invalid_quoted_pair_high_bit_byte) { EXPECT_FALSE(sourcemeta::core::is_email("\"\\\x80\"@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"\\\x80\"@b")); } // RFC 5321 §4.5.3.1.1: 63 qtext bytes plus two DQUOTEs equals 65 octets, one @@ -568,160 +684,193 @@ TEST(Email, invalid_quoted_pair_high_bit_byte) { TEST(Email, invalid_quoted_local_length_65) { EXPECT_FALSE( sourcemeta::core::is_email("\"" + std::string(63, 'a') + "\"@b")); + EXPECT_FALSE( + sourcemeta::core::is_idn_email("\"" + std::string(63, 'a') + "\"@b")); } // RFC 5321 §4.1.2: a Quoted-string may contain an unquoted "@" but the outer // boundary "@" is still required after the closing DQUOTE TEST(Email, valid_two_at_signs_one_quoted) { EXPECT_TRUE(sourcemeta::core::is_email("\"a@b\"@c")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"a@b\"@c")); } // RFC 5321 §4.1.2: two embedded "@" inside a Quoted-string are still qtextSMTP TEST(Email, valid_two_at_signs_quoted_with_atext) { EXPECT_TRUE(sourcemeta::core::is_email("\"x@y@z\"@d")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"x@y@z\"@d")); } // RFC 5321 §4.1.3: IPv4-address-literal Snum 3("." 
Snum) covers 0.0.0.0 TEST(Email, valid_ipv4_literal_zeros) { EXPECT_TRUE(sourcemeta::core::is_email("a@[0.0.0.0]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[0.0.0.0]")); } // RFC 5321 §4.1.3: IPv4-address-literal Snum max is 255 TEST(Email, valid_ipv4_literal_max) { EXPECT_TRUE(sourcemeta::core::is_email("a@[255.255.255.255]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[255.255.255.255]")); } // RFC 5321 §4.1.3: IPv4-address-literal typical RFC 1918 address TEST(Email, valid_ipv4_literal_typical) { EXPECT_TRUE(sourcemeta::core::is_email("a@[192.168.1.1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[192.168.1.1]")); } // RFC 5321 §4.1.3: IPv4-address-literal loopback TEST(Email, valid_ipv4_literal_loopback) { EXPECT_TRUE(sourcemeta::core::is_email("a@[127.0.0.1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[127.0.0.1]")); } // RFC 5321 §4.1.3: IPv4-address-literal one-digit Snum TEST(Email, valid_ipv4_literal_single_digit) { EXPECT_TRUE(sourcemeta::core::is_email("a@[1.2.3.4]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[1.2.3.4]")); } // RFC 5321 §4.1.3: IPv4-address-literal two-digit Snum TEST(Email, valid_ipv4_literal_two_digit) { EXPECT_TRUE(sourcemeta::core::is_email("a@[10.20.30.40]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[10.20.30.40]")); } // RFC 5321 §4.1.2: Domain accepts numeric-only labels without brackets TEST(Email, valid_domain_dotted_numeric) { EXPECT_TRUE(sourcemeta::core::is_email("a@1.2.3.4")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@1.2.3.4")); } // RFC 5321 §4.1.3: Snum value range 0-255, 256 is out of range TEST(Email, invalid_ipv4_octet_256) { EXPECT_FALSE(sourcemeta::core::is_email("a@[256.0.0.0]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[256.0.0.0]")); } // RFC 5321 §4.1.3: Snum value range 0-255, 999 is out of range TEST(Email, invalid_ipv4_octet_999) { EXPECT_FALSE(sourcemeta::core::is_email("a@[999.0.0.1]")); + 
EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[999.0.0.1]")); } // RFC 5321 §4.1.3: IPv4-address-literal requires exactly four Snum octets TEST(Email, invalid_ipv4_three_octets) { EXPECT_FALSE(sourcemeta::core::is_email("a@[1.2.3]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[1.2.3]")); } // RFC 5321 §4.1.3: IPv4-address-literal rejects five Snum octets TEST(Email, invalid_ipv4_five_octets) { EXPECT_FALSE(sourcemeta::core::is_email("a@[1.2.3.4.5]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[1.2.3.4.5]")); } // RFC 5321 §4.1.3: Snum = 1*3DIGIT, leading zero in multi-digit Snum is // rejected by is_ipv4 TEST(Email, invalid_ipv4_leading_zero) { EXPECT_FALSE(sourcemeta::core::is_email("a@[01.2.3.4]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[01.2.3.4]")); } // RFC 5321 §4.1.3: IPv4-address-literal cannot end with a "." TEST(Email, invalid_ipv4_trailing_dot) { EXPECT_FALSE(sourcemeta::core::is_email("a@[1.2.3.4.]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[1.2.3.4.]")); } // RFC 5321 §4.1.3: IPv4-address-literal cannot begin with a "." 
TEST(Email, invalid_ipv4_leading_dot) { EXPECT_FALSE(sourcemeta::core::is_email("a@[.1.2.3.4]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[.1.2.3.4]")); } // RFC 5321 §4.1.3: Snum = 1*3DIGIT, "-" is not a digit TEST(Email, invalid_ipv4_negative_octet) { EXPECT_FALSE(sourcemeta::core::is_email("a@[-1.2.3.4]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[-1.2.3.4]")); } // RFC 5321 §4.1.3: Snum = 1*3DIGIT, alphabetic bytes are not digits TEST(Email, invalid_ipv4_alpha_octet) { EXPECT_FALSE(sourcemeta::core::is_email("a@[1.2.a.4]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[1.2.a.4]")); } // RFC 5321 §4.1.3: address-literal requires a closing "]" TEST(Email, invalid_ipv4_missing_close_bracket) { EXPECT_FALSE(sourcemeta::core::is_email("a@[1.2.3.4")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[1.2.3.4")); } // RFC 5321 §4.1.2 Domain: "]" is not in Ldh-str, so unbracketed forms fail TEST(Email, invalid_ipv4_missing_open_bracket) { EXPECT_FALSE(sourcemeta::core::is_email("a@1.2.3.4]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@1.2.3.4]")); } // RFC 5321 §4.1.3: no content is permitted after the closing "]" TEST(Email, invalid_ipv4_trailing_garbage) { EXPECT_FALSE(sourcemeta::core::is_email("a@[1.2.3.4]x")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[1.2.3.4]x")); } // RFC 5321 §4.1.2 Domain: "[" is not in Ldh-str, so embedded brackets fail TEST(Email, invalid_ipv4_leading_garbage) { EXPECT_FALSE(sourcemeta::core::is_email("a@x[1.2.3.4]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@x[1.2.3.4]")); } // RFC 5321 §4.1.3: address-literal requires at least one of IPv4, IPv6, or // General-address-literal between the brackets TEST(Email, invalid_ipv4_empty_brackets) { EXPECT_FALSE(sourcemeta::core::is_email("a@[]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[]")); } // RFC 5321 §4.1.3: IPv6-address-literal = "IPv6:" IPv6-addr, loopback form TEST(Email, valid_ipv6_literal_loopback) { 
EXPECT_TRUE(sourcemeta::core::is_email("a@[IPv6:::1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[IPv6:::1]")); } // RFC 4291 §2.2: "::" compresses an all-zeros address TEST(Email, valid_ipv6_literal_all_zeros) { EXPECT_TRUE(sourcemeta::core::is_email("a@[IPv6:::]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[IPv6:::]")); } // RFC 5321 §4.1.3: IPv6-address-literal with one compressed group TEST(Email, valid_ipv6_literal_typical) { EXPECT_TRUE(sourcemeta::core::is_email("a@[IPv6:2001:db8::1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[IPv6:2001:db8::1]")); } // RFC 4291 §2.2: IPv6-full form with eight 1-4 hex groups TEST(Email, valid_ipv6_literal_full_form) { EXPECT_TRUE(sourcemeta::core::is_email( "a@[IPv6:2001:0db8:0000:0000:0000:0000:0000:0001]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email( + "a@[IPv6:2001:0db8:0000:0000:0000:0000:0000:0001]")); } // RFC 4291 §2.5.6: link-local prefix fe80::/10 TEST(Email, valid_ipv6_literal_link_local) { EXPECT_TRUE(sourcemeta::core::is_email("a@[IPv6:fe80::1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[IPv6:fe80::1]")); } // RFC 4291 §2.2: IPv4-mapped IPv6 address form TEST(Email, valid_ipv6_literal_v4_mapped) { EXPECT_TRUE(sourcemeta::core::is_email("a@[IPv6:::ffff:192.168.1.1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[IPv6:::ffff:192.168.1.1]")); } // RFC 4291 §2.5.5: IPv4-compatible IPv6 address form TEST(Email, valid_ipv6_literal_v4_compat) { EXPECT_TRUE(sourcemeta::core::is_email("a@[IPv6:::192.168.1.1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[IPv6:::192.168.1.1]")); } // RFC 5321 §4.1.3: without the "IPv6:" tag the literal is parsed as @@ -729,22 +878,26 @@ TEST(Email, valid_ipv6_literal_v4_compat) { // because ":" is in dcontent (%d58 is within %d33-90) TEST(Email, valid_no_ipv6_prefix_as_general) { EXPECT_TRUE(sourcemeta::core::is_email("a@[2001:db8::1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[2001:db8::1]")); } // RFC 5234 §2.3: ABNF literal strings 
are case-insensitive by default, so the // "IPv6:" prefix matches "ipv6:" TEST(Email, valid_lowercase_ipv6_literal) { EXPECT_TRUE(sourcemeta::core::is_email("a@[ipv6:::1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[ipv6:::1]")); } // RFC 5234 §2.3: case-insensitive literal also matches "IPV6:" TEST(Email, valid_uppercase_ipv6_literal) { EXPECT_TRUE(sourcemeta::core::is_email("a@[IPV6:::1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[IPV6:::1]")); } // RFC 5234 §2.3: mixed-case prefix also matches the IPv6 tag TEST(Email, valid_mixed_case_ipv6_literal) { EXPECT_TRUE(sourcemeta::core::is_email("a@[iPv6:::1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[iPv6:::1]")); } // RFC 5234 §3.2: ABNF alternatives are unordered. The literal five-byte @@ -752,152 +905,182 @@ TEST(Email, valid_mixed_case_ipv6_literal) { // through to General-address-literal with tag "IPv6" and content ":1" TEST(Email, valid_ipv6_prefix_no_colon_as_general) { EXPECT_TRUE(sourcemeta::core::is_email("a@[IPv6::1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[IPv6::1]")); } // RFC 5234 §3.2: a failed IPv6-addr match falls through to General-address- // literal with tag "IPv6" and content "not-an-address" (all dcontent) TEST(Email, valid_ipv6_body_garbage_as_general) { EXPECT_TRUE(sourcemeta::core::is_email("a@[IPv6:not-an-address]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[IPv6:not-an-address]")); } // RFC 5321 §4.1.3: IPv6-addr requires at least one group TEST(Email, invalid_ipv6_body_empty) { EXPECT_FALSE(sourcemeta::core::is_email("a@[IPv6:]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[IPv6:]")); } // RFC 5234 §3.2: nine groups fail IPv6-addr but the input still matches // General-address-literal with tag "IPv6" and content "1:2:3:4:5:6:7:8:9" TEST(Email, valid_ipv6_too_many_groups_as_general) { EXPECT_TRUE(sourcemeta::core::is_email("a@[IPv6:1:2:3:4:5:6:7:8:9]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[IPv6:1:2:3:4:5:6:7:8:9]")); } // RFC 
5321 §4.1.3: address-literal needs a closing "]" TEST(Email, invalid_ipv6_missing_close_bracket) { EXPECT_FALSE(sourcemeta::core::is_email("a@[IPv6:::1")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[IPv6:::1")); } // RFC 5321 §4.1.3: no content is permitted after the closing "]" TEST(Email, invalid_ipv6_trailing_garbage) { EXPECT_FALSE(sourcemeta::core::is_email("a@[IPv6:::1]x")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[IPv6:::1]x")); } // RFC 5321 §4.1.3: General-address-literal = Standardized-tag ":" 1*dcontent TEST(Email, valid_general_literal_minimal) { EXPECT_TRUE(sourcemeta::core::is_email("a@[X:y]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[X:y]")); } // RFC 5321 §4.1.3: typical X400 tag from the IANA Standardized-tag registry TEST(Email, valid_general_literal_x400) { EXPECT_TRUE(sourcemeta::core::is_email("a@[X400:foo]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[X400:foo]")); } // RFC 5321 §4.1.2: Ldh-str body permits DIGIT before the terminal Let-dig TEST(Email, valid_general_literal_tag_with_digits) { EXPECT_TRUE(sourcemeta::core::is_email("a@[tag1:foo]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[tag1:foo]")); } // RFC 5321 §4.1.2: Ldh-str body permits interior "-" TEST(Email, valid_general_literal_tag_with_interior_hyphen) { EXPECT_TRUE(sourcemeta::core::is_email("a@[tag-name:foo]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[tag-name:foo]")); } // RFC 5321 §4.1.2: Ldh-str = *( ALPHA / DIGIT / "-" ) Let-dig permits a // leading "-" because Standardized-tag is Ldh-str, not Let-dig [Ldh-str] TEST(Email, valid_general_literal_tag_leading_hyphen) { EXPECT_TRUE(sourcemeta::core::is_email("a@[-tag:foo]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[-tag:foo]")); } // RFC 5321 §4.1.3: dcontent starts at %d33 "!" 
TEST(Email, valid_general_literal_dcontent_lower_bound) { EXPECT_TRUE(sourcemeta::core::is_email("a@[Tag:!]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[Tag:!]")); } // RFC 5321 §4.1.3: dcontent %d33-90 ends at "Z" TEST(Email, valid_general_literal_dcontent_upper_first_range) { EXPECT_TRUE(sourcemeta::core::is_email("a@[Tag:Z]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[Tag:Z]")); } // RFC 5321 §4.1.3: dcontent %d94-126 starts at "^" TEST(Email, valid_general_literal_dcontent_lower_second_range) { EXPECT_TRUE(sourcemeta::core::is_email("a@[Tag:^]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[Tag:^]")); } // RFC 5321 §4.1.3: dcontent ends at "~" (%d126) TEST(Email, valid_general_literal_dcontent_upper_bound) { EXPECT_TRUE(sourcemeta::core::is_email("a@[Tag:~]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[Tag:~]")); } // RFC 5321 §4.1.3: dcontent %d33-90 includes ":" (%d58) TEST(Email, valid_general_literal_dcontent_with_colon) { EXPECT_TRUE(sourcemeta::core::is_email("a@[Tag:foo:bar]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[Tag:foo:bar]")); } // RFC 5321 §4.1.3: 1*dcontent permits long content TEST(Email, valid_general_literal_dcontent_long) { EXPECT_TRUE( sourcemeta::core::is_email("a@[Tag:" + std::string(200, 'a') + "]")); + EXPECT_TRUE( + sourcemeta::core::is_idn_email("a@[Tag:" + std::string(200, 'a') + "]")); } // RFC 5321 §4.1.2: Ldh-str must end with Let-dig, trailing "-" is invalid TEST(Email, invalid_general_tag_trailing_hyphen) { EXPECT_FALSE(sourcemeta::core::is_email("a@[tag-:x]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[tag-:x]")); } // RFC 5321 §4.1.2: Ldh-str requires a terminal Let-dig, lone "-" is invalid TEST(Email, invalid_general_tag_single_hyphen) { EXPECT_FALSE(sourcemeta::core::is_email("a@[-:x]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[-:x]")); } // RFC 5321 §4.1.2: Ldh-str alphabet excludes "_" TEST(Email, invalid_general_tag_with_underscore) { 
EXPECT_FALSE(sourcemeta::core::is_email("a@[tag_name:x]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[tag_name:x]")); } // RFC 5321 §4.1.2: Standardized-tag = Ldh-str, minimum length is one byte TEST(Email, invalid_general_tag_empty) { EXPECT_FALSE(sourcemeta::core::is_email("a@[:x]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[:x]")); } // RFC 5321 §4.1.3: General-address-literal = tag ":" 1*dcontent, empty content // is invalid TEST(Email, invalid_general_empty_dcontent) { EXPECT_FALSE(sourcemeta::core::is_email("a@[X400:]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[X400:]")); } // RFC 5321 §4.1.3: General-address-literal requires ":" between tag and // content TEST(Email, invalid_general_no_colon) { EXPECT_FALSE(sourcemeta::core::is_email("a@[X400foo]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[X400foo]")); } // RFC 5321 §4.1.3: dcontent excludes "[" (%d91) TEST(Email, invalid_general_dcontent_lbracket) { EXPECT_FALSE(sourcemeta::core::is_email("a@[X400:[]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[X400:[]")); } // RFC 5321 §4.1.3: dcontent excludes "\\" (%d92) TEST(Email, invalid_general_dcontent_backslash) { EXPECT_FALSE(sourcemeta::core::is_email("a@[X400:\\]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[X400:\\]")); } // RFC 5321 §4.1.3: dcontent excludes "]" (%d93) TEST(Email, invalid_general_dcontent_rbracket) { EXPECT_FALSE(sourcemeta::core::is_email("a@[X400:a]b]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[X400:a]b]")); } // RFC 5321 §4.1.3: dcontent excludes SP (%d32) TEST(Email, invalid_general_dcontent_space) { EXPECT_FALSE(sourcemeta::core::is_email("a@[X400:a b]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[X400:a b]")); } // RFC 5321 §4.1.3: dcontent excludes controls (%d0-31) TEST(Email, invalid_general_dcontent_control) { EXPECT_FALSE(sourcemeta::core::is_email("a@[X400:\x01]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[X400:\x01]")); } // RFC 5321 §4.1.3: dcontent 
is ASCII, bytes >= 0x80 are excluded TEST(Email, invalid_general_dcontent_high_bit) { EXPECT_FALSE(sourcemeta::core::is_email("a@[X400:\x80]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[X400:\x80]")); } // RFC 5321 §4.5.3.1: 64-byte Local-part plus "@" plus a 254-byte Domain still @@ -907,12 +1090,17 @@ TEST(Email, valid_max_length_email) { std::string(64, 'a') + "@" + std::string(63, 'b') + "." + std::string(63, 'c') + "." + std::string(63, 'd') + "." + std::string(62, 'e'))); + EXPECT_TRUE(sourcemeta::core::is_idn_email( + std::string(64, 'a') + "@" + std::string(63, 'b') + "." + + std::string(63, 'c') + "." + std::string(63, 'd') + "." + + std::string(62, 'e'))); } // RFC 5321 §4.5.3.1.1: 65-byte Local-part exceeds the cap even with a valid // Domain TEST(Email, invalid_local_65_with_valid_domain) { EXPECT_FALSE(sourcemeta::core::is_email(std::string(65, 'a') + "@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email(std::string(65, 'a') + "@b")); } // RFC 5321 §4.5.3.1.2: a Domain exceeding the cap is rejected even with a @@ -922,37 +1110,47 @@ TEST(Email, invalid_local_64_with_domain_over_cap) { std::string(64, 'a') + "@" + std::string(63, 'b') + "." + std::string(63, 'c') + "." + std::string(63, 'd') + "." + std::string(63, 'e') + ".f")); + EXPECT_FALSE(sourcemeta::core::is_idn_email( + std::string(64, 'a') + "@" + std::string(63, 'b') + "." + + std::string(63, 'c') + "." + std::string(63, 'd') + "." 
+ + std::string(63, 'e') + ".f")); } // RFC 5321 §4.1.2 + §4.1.3: Quoted-string Local-part with IPv4 address-literal TEST(Email, valid_quoted_local_with_ipv4_literal) { EXPECT_TRUE(sourcemeta::core::is_email("\"foo\"@[192.168.1.1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"foo\"@[192.168.1.1]")); } // RFC 5321 §4.1.2 + §4.1.3: Quoted-string Local-part with IPv6 address-literal TEST(Email, valid_quoted_local_with_ipv6_literal) { EXPECT_TRUE(sourcemeta::core::is_email("\"foo\"@[IPv6:::1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"foo\"@[IPv6:::1]")); } // RFC 5321 §4.1.2 + §4.1.3: Quoted-string Local-part with General-address- // literal TEST(Email, valid_quoted_local_with_general_literal) { EXPECT_TRUE(sourcemeta::core::is_email("\"foo\"@[X400:bar]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"foo\"@[X400:bar]")); } // RFC 5321 §4.1.2 + §4.1.3: Dot-string Local-part with IPv4 address-literal TEST(Email, valid_dot_string_with_ipv4_literal) { EXPECT_TRUE(sourcemeta::core::is_email("foo@[192.168.1.1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("foo@[192.168.1.1]")); } // RFC 5321 §4.1.2 + §4.1.3: Dot-string Local-part with IPv6 address-literal TEST(Email, valid_dot_string_with_ipv6_literal) { EXPECT_TRUE(sourcemeta::core::is_email("foo@[IPv6:::1]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("foo@[IPv6:::1]")); } // RFC 5321 §4.1.2 + §4.1.3: Dot-string Local-part with General-address-literal TEST(Email, valid_dot_string_with_general_literal) { EXPECT_TRUE(sourcemeta::core::is_email("foo@[X400:bar]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("foo@[X400:bar]")); } // RFC 5321 §4.5.3.1.2: an address-literal whose total length equals the @@ -960,6 +1158,8 @@ TEST(Email, valid_dot_string_with_general_literal) { TEST(Email, valid_address_literal_length_255) { EXPECT_TRUE( sourcemeta::core::is_email("a@[X:" + std::string(251, 'a') + "]")); + EXPECT_TRUE( + sourcemeta::core::is_idn_email("a@[X:" + std::string(251, 'a') + "]")); } // RFC 
5321 §4.5.3.1.2: an address-literal one octet past the 255-octet cap is @@ -967,164 +1167,202 @@ TEST(Email, valid_address_literal_length_255) { TEST(Email, invalid_address_literal_length_256) { EXPECT_FALSE( sourcemeta::core::is_email("a@[X:" + std::string(252, 'a') + "]")); + EXPECT_FALSE( + sourcemeta::core::is_idn_email("a@[X:" + std::string(252, 'a') + "]")); } // RFC 5321 §4.1.2: Mailbox cannot be empty TEST(Email, invalid_empty_input) { EXPECT_FALSE(sourcemeta::core::is_email("")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("")); } // RFC 5321 §4.1.2: SP is not in atext, qtextSMTP, or Ldh-str outside a // Quoted-string TEST(Email, invalid_whitespace_only) { EXPECT_FALSE(sourcemeta::core::is_email(" ")); + EXPECT_FALSE(sourcemeta::core::is_idn_email(" ")); } // RFC 5321 §4.1.2: leading SP is not part of Dot-string or Quoted-string TEST(Email, invalid_leading_space) { EXPECT_FALSE(sourcemeta::core::is_email(" a@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email(" a@b")); } // RFC 5321 §4.1.2 Domain: trailing SP is not in Ldh-str TEST(Email, invalid_trailing_space) { EXPECT_FALSE(sourcemeta::core::is_email("a@b ")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@b ")); } // RFC 5321 §4.1.2 Domain: NUL is not in Ldh-str TEST(Email, invalid_nul_in_domain) { EXPECT_FALSE(sourcemeta::core::is_email(std::string_view{"a@b\x00c", 5})); + EXPECT_FALSE(sourcemeta::core::is_idn_email(std::string_view{"a@b\x00c", 5})); } // RFC 5321 §4.1.2: CRLF bytes are not in the Mailbox alphabet TEST(Email, invalid_crlf) { EXPECT_FALSE(sourcemeta::core::is_email("a@b\r\n")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@b\r\n")); } // RFC 5321 §4.1.2: LF is not in the Mailbox alphabet -TEST(Email, invalid_lf) { EXPECT_FALSE(sourcemeta::core::is_email("a@b\n")); } +TEST(Email, invalid_lf) { + EXPECT_FALSE(sourcemeta::core::is_email("a@b\n")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@b\n")); +} // RFC 5321 §4.1.2: TAB is not in atext or Ldh-str TEST(Email, 
invalid_tab_in_local) { EXPECT_FALSE(sourcemeta::core::is_email("a\tb@c")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a\tb@c")); } // RFC 5321 §4.1.2 Domain: TAB is not in Ldh-str TEST(Email, invalid_tab_in_domain) { EXPECT_FALSE(sourcemeta::core::is_email("a@b\tc")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@b\tc")); } // RFC 5321 §4.1.2: two consecutive "@" produce an empty Local-part on the // left and a Dot-string Atom on the right that contains "@" TEST(Email, invalid_consecutive_at_signs) { EXPECT_FALSE(sourcemeta::core::is_email("a@@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@@b")); } // RFC 5321 §4.1.2: a single Quoted-string byte must still be followed by the // "@" boundary and a Domain TEST(Email, invalid_quoted_only) { EXPECT_FALSE(sourcemeta::core::is_email("\"a\"")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"a\"")); } // RFC 5321 §4.1.2: a lone DQUOTE is an unterminated Quoted-string TEST(Email, invalid_lone_dquote) { EXPECT_FALSE(sourcemeta::core::is_email("\"")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"")); } // RFC 5321 §4.1.2: a lone "@" has neither Local-part nor Domain -TEST(Email, invalid_lone_at) { EXPECT_FALSE(sourcemeta::core::is_email("@")); } +TEST(Email, invalid_lone_at) { + EXPECT_FALSE(sourcemeta::core::is_email("@")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("@")); +} // RFC 5321 §4.1.2: minimum-length Mailbox is a single atext byte plus "@" // plus a single Let-dig byte TEST(Email, valid_minimum_length_mailbox) { EXPECT_TRUE(sourcemeta::core::is_email("a@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@b")); } // RFC 5321 §4.1.3: a single dcontent byte is the minimum 1*dcontent TEST(Email, valid_general_literal_minimum_dcontent) { EXPECT_TRUE(sourcemeta::core::is_email("a@[A:B]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[A:B]")); } // RFC 5321 §4.1.2 ALPHA upper bound: "Z" (%d90) is in atext TEST(Email, valid_atext_alpha_upper_Z) { 
EXPECT_TRUE(sourcemeta::core::is_email("Z@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("Z@b")); } // RFC 5321 §4.1.2 ALPHA upper bound: "z" (%d122) is in atext TEST(Email, valid_atext_alpha_lower_z) { EXPECT_TRUE(sourcemeta::core::is_email("z@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("z@b")); } // RFC 5321 §4.1.2 DIGIT upper bound: "9" (%d57) is in atext TEST(Email, valid_atext_digit_nine) { EXPECT_TRUE(sourcemeta::core::is_email("9@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("9@b")); } // RFC 5321 §4.1.2: Atom = 1*atext, an all-digit atom is permitted TEST(Email, valid_dot_string_numeric_atom) { EXPECT_TRUE(sourcemeta::core::is_email("123@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("123@b")); } // RFC 5321 §4.1.2: atext mixes letters, digits, and specials in one atom TEST(Email, valid_dot_string_alpha_digit_special_mix) { EXPECT_TRUE(sourcemeta::core::is_email("aB1!c2#@d")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("aB1!c2#@d")); } // RFC 5321 §4.1.2: qtextSMTP %d93-126 excludes %d127 (DEL) TEST(Email, invalid_quoted_qtext_del_byte) { EXPECT_FALSE(sourcemeta::core::is_email("\"\x7f\"@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"\x7f\"@b")); } // RFC 5321 §4.1.2: qtextSMTP excludes %d9 (TAB) TEST(Email, invalid_quoted_qtext_tab) { EXPECT_FALSE(sourcemeta::core::is_email("\"a\tb\"@c")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"a\tb\"@c")); } // RFC 5321 §4.1.2: quoted-pairSMTP body excludes NUL (%d0) TEST(Email, invalid_quoted_pair_nul_byte) { EXPECT_FALSE(sourcemeta::core::is_email(std::string_view{"\"\\\x00\"@b", 6})); + EXPECT_FALSE( + sourcemeta::core::is_idn_email(std::string_view{"\"\\\x00\"@b", 6})); } // RFC 5321 §4.1.2: quoted-pairSMTP body permits "@" (%d64) TEST(Email, valid_quoted_pair_at_sign) { EXPECT_TRUE(sourcemeta::core::is_email("\"\\@\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"\\@\"@b")); } // RFC 5321 §4.1.2: two consecutive quoted-pairs back-to-back inside a // Quoted-string 
TEST(Email, valid_quoted_two_consecutive_pairs) { EXPECT_TRUE(sourcemeta::core::is_email("\"\\\\\\\"\"@b")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"\\\\\\\"\"@b")); } // RFC 5321 §4.1.2 Domain: a single Let-dig digit is a valid sub-domain TEST(Email, valid_domain_single_digit) { EXPECT_TRUE(sourcemeta::core::is_email("a@1")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@1")); } // RFC 5321 §4.1.2: a single atext byte is not a valid Mailbox without "@" TEST(Email, invalid_single_atext_no_at) { EXPECT_FALSE(sourcemeta::core::is_email("a")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a")); } // RFC 5321 §4.1.2 Domain: a stray "]" with no opening "[" cannot match // address-literal and "]" is not in Ldh-str TEST(Email, invalid_unbalanced_closing_bracket) { EXPECT_FALSE(sourcemeta::core::is_email("a@b]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@b]")); } // RFC 5321 §4.1.2 Domain: "[" embedded in a Domain is not in Ldh-str TEST(Email, invalid_bracket_in_middle_of_domain) { EXPECT_FALSE(sourcemeta::core::is_email("a@b[c]d")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@b[c]d")); } // RFC 5321 §4.1.3: a domain consisting of just "[" never closes the // address-literal TEST(Email, invalid_domain_just_open_bracket) { EXPECT_FALSE(sourcemeta::core::is_email("a@[")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[")); } // RFC 5321 §4.1.2 Domain: a domain consisting of just "]" is not Ldh-str TEST(Email, invalid_domain_just_close_bracket) { EXPECT_FALSE(sourcemeta::core::is_email("a@]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@]")); } // RFC 5321 §4.1.3: "::1" between brackets has no "IPv6:" prefix and an empty @@ -1132,98 +1370,117 @@ TEST(Email, invalid_domain_just_close_bracket) { // reject it TEST(Email, invalid_bracket_just_ipv6_addr_no_prefix) { EXPECT_FALSE(sourcemeta::core::is_email("a@[::1]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[::1]")); } // RFC 5321 §4.1.3: a bracketed word without ":" cannot match General, 
and // without digits cannot match IPv4 TEST(Email, invalid_bracket_with_plain_word) { EXPECT_FALSE(sourcemeta::core::is_email("a@[hello]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[hello]")); } // RFC 5321 §4.1.3: leading SP inside the brackets fails IPv4 (non-digit) and // fails General (Standardized-tag has no SP) TEST(Email, invalid_bracket_with_leading_space) { EXPECT_FALSE(sourcemeta::core::is_email("a@[ 1.2.3.4]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[ 1.2.3.4]")); } // RFC 5321 §4.1.3 + RFC 5234 §3.2: a case-insensitive "IPv6:" match that // fails IPv6-addr still falls through to General-address-literal TEST(Email, valid_lowercase_ipv6_fallthrough_to_general) { EXPECT_TRUE(sourcemeta::core::is_email("a@[ipv6:not-an-address]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[ipv6:not-an-address]")); } // RFC 5321 §4.1.2 Ldh-str: leading "-" before another "-" before Let-dig is // still a valid Ldh-str TEST(Email, valid_general_literal_multiple_leading_hyphens) { EXPECT_TRUE(sourcemeta::core::is_email("a@[--a:b]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[--a:b]")); } // RFC 5321 §4.1.3: any Ldh-str is a valid Standardized-tag per the grammar, // including ones not registered with IANA such as "IPv7" TEST(Email, valid_general_literal_ipv7_like_tag) { EXPECT_TRUE(sourcemeta::core::is_email("a@[IPv7:foo]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[IPv7:foo]")); } // RFC 5321 §4.1.2 Ldh-str alphabet excludes ".", so a tag with "." 
cannot // match Standardized-tag TEST(Email, invalid_general_tag_with_dot) { EXPECT_FALSE(sourcemeta::core::is_email("a@[a.b:c]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[a.b:c]")); } // RFC 5321 §4.1.3: General-address-literal content of just ":" is a single // dcontent byte (%d58 is in %d33-90) TEST(Email, valid_general_literal_content_just_colon) { EXPECT_TRUE(sourcemeta::core::is_email("a@[a::]")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@[a::]")); } // RFC 5321 §4.1.3 + RFC 5234 §3.2: bracketed input where the first colon // produces an empty tag fails the Standardized-tag rule TEST(Email, invalid_bracket_empty_tag) { EXPECT_FALSE(sourcemeta::core::is_email("a@[:foo]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[:foo]")); } // RFC 5321 §4.1.2 Domain: an address-literal whose Domain branch tries // is_hostname must reject a stray "[" inside what would otherwise be Ldh-str TEST(Email, invalid_domain_open_bracket_inside) { EXPECT_FALSE(sourcemeta::core::is_email("a@b[c")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@b[c")); } // RFC 5321 §4.1.2 Dot-string: a Quoted-string opener "\"" inside an // otherwise Dot-string Local-part is not in atext TEST(Email, invalid_dquote_inside_dot_string) { EXPECT_FALSE(sourcemeta::core::is_email("a\"b@c")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a\"b@c")); } // RFC 5321 §4.1.2 Mailbox: a Mailbox cannot start with the boundary "@" TEST(Email, invalid_starts_with_at) { EXPECT_FALSE(sourcemeta::core::is_email("@example.com")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("@example.com")); } // RFC 5321 §4.1.2 Mailbox: a Mailbox cannot end with the boundary "@" TEST(Email, invalid_ends_with_at) { EXPECT_FALSE(sourcemeta::core::is_email("user@")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("user@")); } // RFC 5321 §4.1.2: Local-part = 64 octets via Dot-string that includes "." 
TEST(Email, valid_local_part_length_64_with_dots) { EXPECT_TRUE(sourcemeta::core::is_email(std::string(31, 'a') + "." + std::string(32, 'b') + "@c")); + EXPECT_TRUE(sourcemeta::core::is_idn_email(std::string(31, 'a') + "." + + std::string(32, 'b') + "@c")); } // RFC 5321 §4.5.3.1.1: 65-octet Dot-string Local-part that contains "." TEST(Email, invalid_local_part_length_65_with_dots) { EXPECT_FALSE(sourcemeta::core::is_email(std::string(32, 'a') + "." + std::string(32, 'b') + "@c")); + EXPECT_FALSE(sourcemeta::core::is_idn_email(std::string(32, 'a') + "." + + std::string(32, 'b') + "@c")); } // RFC 5321 §4.1.2: a Domain consisting of many short labels still parses TEST(Email, valid_domain_many_short_labels) { EXPECT_TRUE(sourcemeta::core::is_email("a@a.b.c.d.e.f.g.h.i.j.k")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@a.b.c.d.e.f.g.h.i.j.k")); } // RFC 5321 §4.1.3: dcontent excludes DEL (%d127) TEST(Email, invalid_general_dcontent_del_byte) { EXPECT_FALSE(sourcemeta::core::is_email("a@[Tag:\x7f]")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@[Tag:\x7f]")); } // RFC 5321 §4.1.3 + §4.5.3.1.2: a General-address-literal whose Domain total @@ -1232,6 +1489,8 @@ TEST(Email, invalid_general_dcontent_del_byte) { TEST(Email, valid_general_literal_inner_at_cap) { EXPECT_TRUE( sourcemeta::core::is_email("a@[Tag:" + std::string(249, 'x') + "]")); + EXPECT_TRUE( + sourcemeta::core::is_idn_email("a@[Tag:" + std::string(249, 'x') + "]")); } // RFC 5321 §4.5.3.1.2: General-address-literal one octet past the 255-octet @@ -1239,15 +1498,19 @@ TEST(Email, valid_general_literal_inner_at_cap) { TEST(Email, invalid_general_literal_inner_over_cap) { EXPECT_FALSE( sourcemeta::core::is_email("a@[Tag:" + std::string(250, 'x') + "]")); + EXPECT_FALSE( + sourcemeta::core::is_idn_email("a@[Tag:" + std::string(250, 'x') + "]")); } // RFC 5321 §4.1.2: a single quoted byte plus minimal Domain TEST(Email, valid_quoted_single_letter_then_minimal_domain) { 
EXPECT_TRUE(sourcemeta::core::is_email("\"x\"@y")); + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"x\"@y")); } // RFC 5321 §4.1.2: Dot-string ending with the boundary "@" right after the // dot has no terminating Atom TEST(Email, invalid_dot_then_at) { EXPECT_FALSE(sourcemeta::core::is_email("a.@b")); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a.@b")); } diff --git a/test/email/idn_email_test.cc b/test/email/idn_email_test.cc new file mode 100644 index 0000000000..51769b1514 --- /dev/null +++ b/test/email/idn_email_test.cc @@ -0,0 +1,399 @@ +#include + +#include + +#include + +// example@example.test rendered in Hangul (RFC 6531 §3.3) +// Bytes: 실=EC8BA4 례=EBA180 @=40 실=EC8BA4 례=EBA180 .=2E 테=ED858C 스=EC8AA4 +// 트=ED8AB8 +TEST(IdnEmail, valid_hangul_example_at_example_test) { + EXPECT_TRUE(sourcemeta::core::is_idn_email( + "\xec\x8b\xa4\xeb\xa1\x80" + "@" + "\xec\x8b\xa4\xeb\xa1\x80.\xed\x85\x8c\xec\x8a\xa4\xed\x8a\xb8")); + EXPECT_FALSE(sourcemeta::core::is_email( + "\xec\x8b\xa4\xeb\xa1\x80" + "@" + "\xec\x8b\xa4\xeb\xa1\x80.\xed\x85\x8c\xec\x8a\xa4\xed\x8a\xb8")); +} + +TEST(IdnEmail, invalid_bare_number) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("2962")); + EXPECT_FALSE(sourcemeta::core::is_email("2962")); +} + +TEST(IdnEmail, valid_typical_ascii_address) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("joe.bloggs@example.com")); + EXPECT_TRUE(sourcemeta::core::is_email("joe.bloggs@example.com")); +} + +// RFC 5321 §4.1.2: ASCII Dot-string is a subset of the extended grammar +TEST(IdnEmail, valid_ascii_single_atom) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@b")); + EXPECT_TRUE(sourcemeta::core::is_email("a@b")); +} + +TEST(IdnEmail, valid_ascii_two_atoms) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("a.b@c")); + EXPECT_TRUE(sourcemeta::core::is_email("a.b@c")); +} + +TEST(IdnEmail, valid_ascii_many_atoms) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("a.b.c.d@example.com")); + EXPECT_TRUE(sourcemeta::core::is_email("a.b.c.d@example.com")); 
+} + +TEST(IdnEmail, valid_ascii_atext_symbols) { + EXPECT_TRUE( + sourcemeta::core::is_idn_email("a!#$%&'*+-/=?^_`{|}~@example.com")); + EXPECT_TRUE(sourcemeta::core::is_email("a!#$%&'*+-/=?^_`{|}~@example.com")); +} + +TEST(IdnEmail, valid_ascii_uppercase_local) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("ABC@example.com")); + EXPECT_TRUE(sourcemeta::core::is_email("ABC@example.com")); +} + +TEST(IdnEmail, valid_ascii_digit_local) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("123@example.com")); + EXPECT_TRUE(sourcemeta::core::is_email("123@example.com")); +} + +// RFC 6531 §3.3: atext =/ UTF8-non-ascii (2-byte: U+03B1 GREEK SMALL ALPHA) +TEST(IdnEmail, valid_local_two_byte_utf8) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("\xce\xb1@b")); + EXPECT_FALSE(sourcemeta::core::is_email("\xce\xb1@b")); +} + +// RFC 6531 §3.3: atext =/ UTF8-non-ascii (3-byte: U+4E2D CJK 中) +TEST(IdnEmail, valid_local_three_byte_utf8) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("\xe4\xb8\xad@b")); + EXPECT_FALSE(sourcemeta::core::is_email("\xe4\xb8\xad@b")); +} + +// RFC 6531 §3.3: atext =/ UTF8-non-ascii (4-byte: U+1F600 GRINNING FACE) +TEST(IdnEmail, valid_local_four_byte_utf8) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("\xf0\x9f\x98\x80@b")); + EXPECT_FALSE(sourcemeta::core::is_email("\xf0\x9f\x98\x80@b")); +} + +TEST(IdnEmail, valid_local_mixed_ascii_and_utf8) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("user.\xce\xb1@example.com")); + EXPECT_FALSE(sourcemeta::core::is_email("user.\xce\xb1@example.com")); +} + +TEST(IdnEmail, valid_local_multi_atom_with_utf8) { + EXPECT_TRUE( + sourcemeta::core::is_idn_email("\xe4\xb8\xad.\xce\xb1.user@example.com")); + EXPECT_FALSE( + sourcemeta::core::is_email("\xe4\xb8\xad.\xce\xb1.user@example.com")); +} + +TEST(IdnEmail, valid_local_utf8_only_two_atoms) { + EXPECT_TRUE( + sourcemeta::core::is_idn_email("\xce\xb1.\xe4\xb8\xad@example.com")); + EXPECT_FALSE(sourcemeta::core::is_email("\xce\xb1.\xe4\xb8\xad@example.com")); +} + 
+// RFC 6531 §3.3: sub-domain =/ U-label (2-byte U-label only) +TEST(IdnEmail, valid_domain_two_byte_utf8) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@\xce\xb1")); + EXPECT_FALSE(sourcemeta::core::is_email("a@\xce\xb1")); +} + +// RFC 6531 §3.3: sub-domain =/ U-label (3-byte U-label only) +TEST(IdnEmail, valid_domain_three_byte_utf8) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@\xe4\xb8\xad")); + EXPECT_FALSE(sourcemeta::core::is_email("a@\xe4\xb8\xad")); +} + +TEST(IdnEmail, valid_domain_mixed_labels) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@example.\xce\xb1.com")); + EXPECT_FALSE(sourcemeta::core::is_email("a@example.\xce\xb1.com")); +} + +TEST(IdnEmail, valid_domain_utf8_with_hyphen) { + // U-labels may contain hyphens; just not at the start/end of a label + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@\xce\xb1-\xe4\xb8\xad")); + EXPECT_FALSE(sourcemeta::core::is_email("a@\xce\xb1-\xe4\xb8\xad")); +} + +TEST(IdnEmail, valid_domain_many_labels) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@a.b.c.\xce\xb1.d.e")); + EXPECT_FALSE(sourcemeta::core::is_email("a@a.b.c.\xce\xb1.d.e")); +} + +// RFC 5321 §4.1.2: Quoted-string with ASCII-only content +TEST(IdnEmail, valid_quoted_ascii) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"a b\"@example.com")); + EXPECT_TRUE(sourcemeta::core::is_email("\"a b\"@example.com")); +} + +// RFC 6531 §3.3: qtextSMTP =/ UTF8-non-ascii +TEST(IdnEmail, valid_quoted_with_two_byte_utf8) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"\xce\xb1\"@example.com")); + EXPECT_FALSE(sourcemeta::core::is_email("\"\xce\xb1\"@example.com")); +} + +TEST(IdnEmail, valid_quoted_with_three_byte_utf8) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"\xe4\xb8\xad\"@example.com")); + EXPECT_FALSE(sourcemeta::core::is_email("\"\xe4\xb8\xad\"@example.com")); +} + +TEST(IdnEmail, valid_quoted_with_four_byte_utf8) { + EXPECT_TRUE( + sourcemeta::core::is_idn_email("\"\xf0\x9f\x98\x80\"@example.com")); + 
EXPECT_FALSE(sourcemeta::core::is_email("\"\xf0\x9f\x98\x80\"@example.com")); +} + +TEST(IdnEmail, valid_quoted_mixed_ascii_and_utf8) { + EXPECT_TRUE( + sourcemeta::core::is_idn_email("\"\xce\xb1 \xe4\xb8\xad\"@example.com")); + EXPECT_FALSE( + sourcemeta::core::is_email("\"\xce\xb1 \xe4\xb8\xad\"@example.com")); +} + +TEST(IdnEmail, valid_quoted_with_quoted_pair) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("\"a\\\"b\"@example.com")); + EXPECT_TRUE(sourcemeta::core::is_email("\"a\\\"b\"@example.com")); +} + +// RFC 5321 §4.1.3: address-literal IPv4 stays ASCII (no IDNA applies) +TEST(IdnEmail, valid_address_literal_ipv4) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("user@[192.168.1.1]")); + EXPECT_TRUE(sourcemeta::core::is_email("user@[192.168.1.1]")); +} + +TEST(IdnEmail, valid_address_literal_ipv6) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("user@[IPv6:::1]")); + EXPECT_TRUE(sourcemeta::core::is_email("user@[IPv6:::1]")); +} + +TEST(IdnEmail, valid_address_literal_ipv6_lowercase_tag) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("user@[ipv6:::1]")); + EXPECT_TRUE(sourcemeta::core::is_email("user@[ipv6:::1]")); +} + +TEST(IdnEmail, valid_address_literal_with_utf8_local) { + EXPECT_TRUE(sourcemeta::core::is_idn_email("\xce\xb1@[192.168.1.1]")); + EXPECT_FALSE(sourcemeta::core::is_email("\xce\xb1@[192.168.1.1]")); +} + +// RFC 5321 §4.5.3.1.1: Local-part is allowed up to 64 octets +TEST(IdnEmail, valid_local_at_octet_limit) { + const std::string local(64, 'a'); + EXPECT_TRUE(sourcemeta::core::is_idn_email(local + "@example.com")); + EXPECT_TRUE(sourcemeta::core::is_email(local + "@example.com")); +} + +TEST(IdnEmail, valid_local_at_octet_limit_with_utf8) { + // 21 Greek alpha (CE B1 = 2 bytes each) = 42 bytes, plus 22 ASCII 'a' = 64 + std::string local; + for (int index = 0; index < 21; ++index) { + local.append("\xce\xb1"); + } + local.append(22, 'a'); + EXPECT_EQ(local.size(), 64u); + EXPECT_TRUE(sourcemeta::core::is_idn_email(local + "@example.com")); + 
EXPECT_FALSE(sourcemeta::core::is_email(local + "@example.com")); +} + +TEST(IdnEmail, invalid_missing_at) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("plain")); + EXPECT_FALSE(sourcemeta::core::is_email("plain")); +} + +TEST(IdnEmail, invalid_empty_local) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("@example.com")); + EXPECT_FALSE(sourcemeta::core::is_email("@example.com")); +} + +TEST(IdnEmail, invalid_empty_domain) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("user@")); + EXPECT_FALSE(sourcemeta::core::is_email("user@")); +} + +TEST(IdnEmail, invalid_empty) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("")); + EXPECT_FALSE(sourcemeta::core::is_email("")); +} + +TEST(IdnEmail, invalid_two_at_signs) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@b@c")); + EXPECT_FALSE(sourcemeta::core::is_email("a@b@c")); +} + +TEST(IdnEmail, invalid_local_leading_dot) { + EXPECT_FALSE(sourcemeta::core::is_idn_email(".user@example.com")); + EXPECT_FALSE(sourcemeta::core::is_email(".user@example.com")); +} + +TEST(IdnEmail, invalid_local_trailing_dot) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("user.@example.com")); + EXPECT_FALSE(sourcemeta::core::is_email("user.@example.com")); +} + +TEST(IdnEmail, invalid_local_consecutive_dots) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("a..b@example.com")); + EXPECT_FALSE(sourcemeta::core::is_email("a..b@example.com")); +} + +TEST(IdnEmail, invalid_local_just_dot) { + EXPECT_FALSE(sourcemeta::core::is_idn_email(".@example.com")); + EXPECT_FALSE(sourcemeta::core::is_email(".@example.com")); +} + +// RFC 6532 §3.1: lone continuation byte 0xBF is not the start of UTF-8 +TEST(IdnEmail, invalid_lone_continuation_byte) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("\xbf@b")); + EXPECT_FALSE(sourcemeta::core::is_email("\xbf@b")); +} + +// RFC 6532 §3.1: 2-byte starter with no continuation byte +TEST(IdnEmail, invalid_truncated_two_byte_at_end_of_local) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("\xce@b")); + 
EXPECT_FALSE(sourcemeta::core::is_email("\xce@b")); +} + +// RFC 6532 §3.1: %xE0 %x80-9F is overlong (codepoints < U+0800) +TEST(IdnEmail, invalid_overlong_three_byte) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("\xe0\x80\xa0@b")); + EXPECT_FALSE(sourcemeta::core::is_email("\xe0\x80\xa0@b")); +} + +// RFC 6532 §3.1: U+D800 surrogate is forbidden +TEST(IdnEmail, invalid_surrogate_codepoint_in_local) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("\xed\xa0\x80@b")); + EXPECT_FALSE(sourcemeta::core::is_email("\xed\xa0\x80@b")); +} + +// RFC 6532 §3.1: codepoints above U+10FFFF are forbidden +TEST(IdnEmail, invalid_above_max_codepoint_in_local) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("\xf4\x90\x80\x80@b")); + EXPECT_FALSE(sourcemeta::core::is_email("\xf4\x90\x80\x80@b")); +} + +// RFC 6532 §3.1: 4-byte starter with truncated continuation +TEST(IdnEmail, invalid_truncated_four_byte_in_local) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("\xf0\x9f\x98@b")); + EXPECT_FALSE(sourcemeta::core::is_email("\xf0\x9f\x98@b")); +} + +// RFC 6532 §3.1: %xC0 is a forbidden lead byte (overlong U+0000) +TEST(IdnEmail, invalid_overlong_c0_in_local) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("\xc0\x80@b")); + EXPECT_FALSE(sourcemeta::core::is_email("\xc0\x80@b")); +} + +// RFC 6532 §3.1: %xF5 is not a valid lead byte +TEST(IdnEmail, invalid_lead_f5_in_local) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("\xf5\x80\x80\x80@b")); + EXPECT_FALSE(sourcemeta::core::is_email("\xf5\x80\x80\x80@b")); +} + +TEST(IdnEmail, invalid_invalid_utf8_in_domain) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@\xc0\x80")); + EXPECT_FALSE(sourcemeta::core::is_email("a@\xc0\x80")); +} + +TEST(IdnEmail, invalid_surrogate_in_domain) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@\xed\xa0\x80")); + EXPECT_FALSE(sourcemeta::core::is_email("a@\xed\xa0\x80")); +} + +TEST(IdnEmail, invalid_lone_continuation_in_domain) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@\xbf")); + 
EXPECT_FALSE(sourcemeta::core::is_email("a@\xbf")); +} + +TEST(IdnEmail, invalid_invalid_utf8_in_quoted) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"\xc0\x80\"@example.com")); + EXPECT_FALSE(sourcemeta::core::is_email("\"\xc0\x80\"@example.com")); +} + +TEST(IdnEmail, invalid_truncated_utf8_in_quoted) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("\"\xce\"@example.com")); + EXPECT_FALSE(sourcemeta::core::is_email("\"\xce\"@example.com")); +} + +// RFC 6531 §3.3: domain label cannot start with a hyphen +TEST(IdnEmail, invalid_domain_leading_hyphen) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@-example.com")); + EXPECT_FALSE(sourcemeta::core::is_email("a@-example.com")); +} + +// RFC 6531 §3.3: domain label cannot end with a hyphen +TEST(IdnEmail, invalid_domain_trailing_hyphen) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@example-.com")); + EXPECT_FALSE(sourcemeta::core::is_email("a@example-.com")); +} + +TEST(IdnEmail, invalid_domain_label_trailing_hyphen_with_utf8) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@\xce\xb1-")); + EXPECT_FALSE(sourcemeta::core::is_email("a@\xce\xb1-")); +} + +TEST(IdnEmail, invalid_domain_trailing_dot) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@example.com.")); + EXPECT_FALSE(sourcemeta::core::is_email("a@example.com.")); +} + +TEST(IdnEmail, invalid_domain_empty_label_in_middle) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@example..com")); + EXPECT_FALSE(sourcemeta::core::is_email("a@example..com")); +} + +TEST(IdnEmail, invalid_domain_leading_dot) { + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@.example.com")); + EXPECT_FALSE(sourcemeta::core::is_email("a@.example.com")); +} + +// RFC 5321 §4.5.3.1.1: Local-part > 64 octets is invalid +TEST(IdnEmail, invalid_local_one_over_octet_limit) { + const std::string local(65, 'a'); + EXPECT_FALSE(sourcemeta::core::is_idn_email(local + "@example.com")); + EXPECT_FALSE(sourcemeta::core::is_email(local + "@example.com")); +} + +TEST(IdnEmail, 
invalid_local_one_over_octet_limit_with_utf8) { + // 21 alpha (42 bytes) + 23 'a' = 65 bytes + std::string local; + for (int index = 0; index < 21; ++index) { + local.append("\xce\xb1"); + } + local.append(23, 'a'); + EXPECT_EQ(local.size(), 65u); + EXPECT_FALSE(sourcemeta::core::is_idn_email(local + "@example.com")); + EXPECT_FALSE(sourcemeta::core::is_email(local + "@example.com")); +} + +// RFC 1035 §2.3.4: single label > 63 octets is invalid +TEST(IdnEmail, invalid_domain_label_too_long) { + const std::string label(64, 'a'); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@" + label)); + EXPECT_FALSE(sourcemeta::core::is_email("a@" + label)); +} + +TEST(IdnEmail, valid_domain_label_at_max_length) { + const std::string label(63, 'a'); + EXPECT_TRUE(sourcemeta::core::is_idn_email("a@" + label)); + EXPECT_TRUE(sourcemeta::core::is_email("a@" + label)); +} + +// RFC 5321 §4.5.3.1.2: total domain > 255 octets is invalid. +// Construction must avoid trailing-dot and per-label (>63) confounds: 5 +// labels of 51/51/51/51/48 'a' chars separated by 4 dots = 256 octets, no +// trailing dot, every label within the 63-octet RFC 1035 cap +TEST(IdnEmail, invalid_domain_total_too_long) { + std::string domain; + for (int index = 0; index < 4; ++index) { + domain.append(51, 'a'); + domain.push_back('.'); + } + domain.append(48, 'a'); + EXPECT_EQ(domain.size(), 256u); + EXPECT_NE(domain.back(), '.'); + EXPECT_FALSE(sourcemeta::core::is_idn_email("a@" + domain)); + EXPECT_FALSE(sourcemeta::core::is_email("a@" + domain)); +} diff --git a/test/unicode/CMakeLists.txt b/test/unicode/CMakeLists.txt index ca829ce501..18ef536775 100644 --- a/test/unicode/CMakeLists.txt +++ b/test/unicode/CMakeLists.txt @@ -1,5 +1,13 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME unicode - SOURCES unicode_test.cc) + SOURCES + codepoint_to_utf8_test.cc + utf8_to_utf32_test.cc + utf8_codepoint_length_test.cc + utf8_lead_byte_size_test.cc + utf8_codepoint_byte_count_test.cc + 
is_utf8_continuation_test.cc + is_surrogate_test.cc + is_valid_codepoint_test.cc) target_link_libraries(sourcemeta_core_unicode_unit PRIVATE sourcemeta::core::unicode) diff --git a/test/unicode/codepoint_to_utf8_test.cc b/test/unicode/codepoint_to_utf8_test.cc new file mode 100644 index 0000000000..f1212cfba0 --- /dev/null +++ b/test/unicode/codepoint_to_utf8_test.cc @@ -0,0 +1,125 @@ +#include + +#include + +#include // std::ostringstream +#include // std::string + +TEST(Unicode_codepoint_to_utf8, ascii_letter) { + EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x41), "A"); +} + +TEST(Unicode_codepoint_to_utf8, ascii_null) { + const std::string expected(1, '\0'); + EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x00), expected); +} + +TEST(Unicode_codepoint_to_utf8, ascii_max) { + EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x7F), "\x7F"); +} + +TEST(Unicode_codepoint_to_utf8, two_byte_min) { + EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x80), "\xC2\x80"); +} + +TEST(Unicode_codepoint_to_utf8, two_byte_latin_e_acute) { + EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0xE9), "\xC3\xA9"); +} + +TEST(Unicode_codepoint_to_utf8, two_byte_max) { + EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x7FF), "\xDF\xBF"); +} + +TEST(Unicode_codepoint_to_utf8, three_byte_min) { + EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x800), "\xE0\xA0\x80"); +} + +TEST(Unicode_codepoint_to_utf8, three_byte_cjk) { + EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x4E16), "\xE4\xB8\x96"); +} + +TEST(Unicode_codepoint_to_utf8, three_byte_max) { + EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0xFFFF), "\xEF\xBF\xBF"); +} + +TEST(Unicode_codepoint_to_utf8, four_byte_min) { + EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x10000), "\xF0\x90\x80\x80"); +} + +TEST(Unicode_codepoint_to_utf8, four_byte_emoji) { + EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x1F600), "\xF0\x9F\x98\x80"); +} + +TEST(Unicode_codepoint_to_utf8, four_byte_max) { + 
EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x10FFFF), "\xF4\x8F\xBF\xBF"); +} + +TEST(Unicode_codepoint_to_utf8, stream_ascii_letter) { + std::ostringstream output; + sourcemeta::core::codepoint_to_utf8(0x41, output); + EXPECT_EQ(output.str(), "A"); +} + +TEST(Unicode_codepoint_to_utf8, stream_two_byte_latin_e_acute) { + std::ostringstream output; + sourcemeta::core::codepoint_to_utf8(0xE9, output); + EXPECT_EQ(output.str(), "\xC3\xA9"); +} + +TEST(Unicode_codepoint_to_utf8, stream_three_byte_cjk) { + std::ostringstream output; + sourcemeta::core::codepoint_to_utf8(0x4E16, output); + EXPECT_EQ(output.str(), "\xE4\xB8\x96"); +} + +TEST(Unicode_codepoint_to_utf8, stream_four_byte_emoji) { + std::ostringstream output; + sourcemeta::core::codepoint_to_utf8(0x1F600, output); + EXPECT_EQ(output.str(), "\xF0\x9F\x98\x80"); +} + +TEST(Unicode_codepoint_to_utf8, stream_multiple_codepoints) { + std::ostringstream output; + sourcemeta::core::codepoint_to_utf8(0x48, output); + sourcemeta::core::codepoint_to_utf8(0xE9, output); + sourcemeta::core::codepoint_to_utf8(0x1F600, output); + EXPECT_EQ(output.str(), "H\xC3\xA9\xF0\x9F\x98\x80"); +} + +TEST(Unicode_codepoint_to_utf8, string_ascii_letter) { + std::string output; + sourcemeta::core::codepoint_to_utf8(0x41, output); + EXPECT_EQ(output, "A"); +} + +TEST(Unicode_codepoint_to_utf8, string_two_byte_latin_e_acute) { + std::string output; + sourcemeta::core::codepoint_to_utf8(0xE9, output); + EXPECT_EQ(output, "\xC3\xA9"); +} + +TEST(Unicode_codepoint_to_utf8, string_three_byte_cjk) { + std::string output; + sourcemeta::core::codepoint_to_utf8(0x4E16, output); + EXPECT_EQ(output, "\xE4\xB8\x96"); +} + +TEST(Unicode_codepoint_to_utf8, string_four_byte_emoji) { + std::string output; + sourcemeta::core::codepoint_to_utf8(0x1F600, output); + EXPECT_EQ(output, "\xF0\x9F\x98\x80"); +} + +TEST(Unicode_codepoint_to_utf8, string_multiple_codepoints) { + std::string output; + sourcemeta::core::codepoint_to_utf8(0x48, output); + 
sourcemeta::core::codepoint_to_utf8(0xE9, output); + sourcemeta::core::codepoint_to_utf8(0x1F600, output); + EXPECT_EQ(output, "H\xC3\xA9\xF0\x9F\x98\x80"); +} + +TEST(Unicode_codepoint_to_utf8, string_four_byte_max) { + std::string output; + sourcemeta::core::codepoint_to_utf8(0x10FFFF, output); + EXPECT_EQ(output, "\xF4\x8F\xBF\xBF"); +} diff --git a/test/unicode/is_surrogate_test.cc b/test/unicode/is_surrogate_test.cc new file mode 100644 index 0000000000..7c0e243dcb --- /dev/null +++ b/test/unicode/is_surrogate_test.cc @@ -0,0 +1,55 @@ +#include + +#include + +TEST(Unicode_is_surrogate, null) { + EXPECT_FALSE(sourcemeta::core::is_surrogate(0x0000)); +} + +TEST(Unicode_is_surrogate, ascii_letter) { + EXPECT_FALSE(sourcemeta::core::is_surrogate(0x0041)); +} + +TEST(Unicode_is_surrogate, just_below_low_surrogate) { + EXPECT_FALSE(sourcemeta::core::is_surrogate(0xD7FF)); +} + +TEST(Unicode_is_surrogate, low_surrogate_low_boundary) { + EXPECT_TRUE(sourcemeta::core::is_surrogate(0xD800)); +} + +TEST(Unicode_is_surrogate, low_surrogate_mid) { + EXPECT_TRUE(sourcemeta::core::is_surrogate(0xDA00)); +} + +TEST(Unicode_is_surrogate, low_surrogate_high_boundary) { + EXPECT_TRUE(sourcemeta::core::is_surrogate(0xDBFF)); +} + +TEST(Unicode_is_surrogate, high_surrogate_low_boundary) { + EXPECT_TRUE(sourcemeta::core::is_surrogate(0xDC00)); +} + +TEST(Unicode_is_surrogate, high_surrogate_mid) { + EXPECT_TRUE(sourcemeta::core::is_surrogate(0xDE00)); +} + +TEST(Unicode_is_surrogate, high_surrogate_high_boundary) { + EXPECT_TRUE(sourcemeta::core::is_surrogate(0xDFFF)); +} + +TEST(Unicode_is_surrogate, just_above_high_surrogate) { + EXPECT_FALSE(sourcemeta::core::is_surrogate(0xE000)); +} + +TEST(Unicode_is_surrogate, max_bmp) { + EXPECT_FALSE(sourcemeta::core::is_surrogate(0xFFFF)); +} + +TEST(Unicode_is_surrogate, emoji_grinning_face) { + EXPECT_FALSE(sourcemeta::core::is_surrogate(0x1F600)); +} + +TEST(Unicode_is_surrogate, max_codepoint) { + 
EXPECT_FALSE(sourcemeta::core::is_surrogate(0x10FFFF)); +} diff --git a/test/unicode/is_utf8_continuation_test.cc b/test/unicode/is_utf8_continuation_test.cc new file mode 100644 index 0000000000..afc69a08a7 --- /dev/null +++ b/test/unicode/is_utf8_continuation_test.cc @@ -0,0 +1,51 @@ +#include + +#include + +TEST(Unicode_is_utf8_continuation, ascii_null_rejected) { + EXPECT_FALSE(sourcemeta::core::is_utf8_continuation(0x00)); +} + +TEST(Unicode_is_utf8_continuation, ascii_letter_rejected) { + EXPECT_FALSE(sourcemeta::core::is_utf8_continuation(0x41)); +} + +TEST(Unicode_is_utf8_continuation, ascii_high_boundary_rejected) { + EXPECT_FALSE(sourcemeta::core::is_utf8_continuation(0x7F)); +} + +TEST(Unicode_is_utf8_continuation, low_boundary) { + EXPECT_TRUE(sourcemeta::core::is_utf8_continuation(0x80)); +} + +TEST(Unicode_is_utf8_continuation, just_above_ascii) { + EXPECT_TRUE(sourcemeta::core::is_utf8_continuation(0x81)); +} + +TEST(Unicode_is_utf8_continuation, mid_range_a0) { + EXPECT_TRUE(sourcemeta::core::is_utf8_continuation(0xA0)); +} + +TEST(Unicode_is_utf8_continuation, mid_range_b0) { + EXPECT_TRUE(sourcemeta::core::is_utf8_continuation(0xB0)); +} + +TEST(Unicode_is_utf8_continuation, high_boundary) { + EXPECT_TRUE(sourcemeta::core::is_utf8_continuation(0xBF)); +} + +TEST(Unicode_is_utf8_continuation, two_byte_lead_rejected) { + EXPECT_FALSE(sourcemeta::core::is_utf8_continuation(0xC0)); +} + +TEST(Unicode_is_utf8_continuation, three_byte_lead_rejected) { + EXPECT_FALSE(sourcemeta::core::is_utf8_continuation(0xE0)); +} + +TEST(Unicode_is_utf8_continuation, four_byte_lead_rejected) { + EXPECT_FALSE(sourcemeta::core::is_utf8_continuation(0xF0)); +} + +TEST(Unicode_is_utf8_continuation, max_byte_rejected) { + EXPECT_FALSE(sourcemeta::core::is_utf8_continuation(0xFF)); +} diff --git a/test/unicode/is_valid_codepoint_test.cc b/test/unicode/is_valid_codepoint_test.cc new file mode 100644 index 0000000000..d47d2473c3 --- /dev/null +++ 
b/test/unicode/is_valid_codepoint_test.cc @@ -0,0 +1,67 @@ +#include + +#include + +TEST(Unicode_is_valid_codepoint, null) { + EXPECT_TRUE(sourcemeta::core::is_valid_codepoint(0x0000)); +} + +TEST(Unicode_is_valid_codepoint, ascii_letter) { + EXPECT_TRUE(sourcemeta::core::is_valid_codepoint(0x0041)); +} + +TEST(Unicode_is_valid_codepoint, ascii_high_boundary) { + EXPECT_TRUE(sourcemeta::core::is_valid_codepoint(0x007F)); +} + +TEST(Unicode_is_valid_codepoint, latin_extended) { + EXPECT_TRUE(sourcemeta::core::is_valid_codepoint(0x00E9)); +} + +TEST(Unicode_is_valid_codepoint, just_below_surrogate_range) { + EXPECT_TRUE(sourcemeta::core::is_valid_codepoint(0xD7FF)); +} + +TEST(Unicode_is_valid_codepoint, low_surrogate_low_boundary_rejected) { + EXPECT_FALSE(sourcemeta::core::is_valid_codepoint(0xD800)); +} + +TEST(Unicode_is_valid_codepoint, low_surrogate_high_boundary_rejected) { + EXPECT_FALSE(sourcemeta::core::is_valid_codepoint(0xDBFF)); +} + +TEST(Unicode_is_valid_codepoint, high_surrogate_low_boundary_rejected) { + EXPECT_FALSE(sourcemeta::core::is_valid_codepoint(0xDC00)); +} + +TEST(Unicode_is_valid_codepoint, high_surrogate_high_boundary_rejected) { + EXPECT_FALSE(sourcemeta::core::is_valid_codepoint(0xDFFF)); +} + +TEST(Unicode_is_valid_codepoint, just_above_surrogate_range) { + EXPECT_TRUE(sourcemeta::core::is_valid_codepoint(0xE000)); +} + +TEST(Unicode_is_valid_codepoint, max_bmp) { + EXPECT_TRUE(sourcemeta::core::is_valid_codepoint(0xFFFF)); +} + +TEST(Unicode_is_valid_codepoint, smp_low_boundary) { + EXPECT_TRUE(sourcemeta::core::is_valid_codepoint(0x10000)); +} + +TEST(Unicode_is_valid_codepoint, emoji_grinning_face) { + EXPECT_TRUE(sourcemeta::core::is_valid_codepoint(0x1F600)); +} + +TEST(Unicode_is_valid_codepoint, max_codepoint) { + EXPECT_TRUE(sourcemeta::core::is_valid_codepoint(0x10FFFF)); +} + +TEST(Unicode_is_valid_codepoint, just_above_max_rejected) { + EXPECT_FALSE(sourcemeta::core::is_valid_codepoint(0x110000)); +} + 
+TEST(Unicode_is_valid_codepoint, far_above_max_rejected) { + EXPECT_FALSE(sourcemeta::core::is_valid_codepoint(0x1FFFFF)); +} diff --git a/test/unicode/unicode_test.cc b/test/unicode/unicode_test.cc deleted file mode 100644 index 6650920404..0000000000 --- a/test/unicode/unicode_test.cc +++ /dev/null @@ -1,228 +0,0 @@ -#include - -#include - -#include // std::istringstream, std::ostringstream -#include // std::string, std::u32string - -TEST(Unicode, codepoint_to_utf8_ascii_letter) { - EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x41), "A"); -} - -TEST(Unicode, codepoint_to_utf8_ascii_null) { - const std::string expected(1, '\0'); - EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x00), expected); -} - -TEST(Unicode, codepoint_to_utf8_ascii_max) { - EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x7F), "\x7F"); -} - -TEST(Unicode, codepoint_to_utf8_two_byte_min) { - EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x80), "\xC2\x80"); -} - -TEST(Unicode, codepoint_to_utf8_two_byte_latin_e_acute) { - EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0xE9), "\xC3\xA9"); -} - -TEST(Unicode, codepoint_to_utf8_two_byte_max) { - EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x7FF), "\xDF\xBF"); -} - -TEST(Unicode, codepoint_to_utf8_three_byte_min) { - EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x800), "\xE0\xA0\x80"); -} - -TEST(Unicode, codepoint_to_utf8_three_byte_cjk) { - EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x4E16), "\xE4\xB8\x96"); -} - -TEST(Unicode, codepoint_to_utf8_three_byte_max) { - EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0xFFFF), "\xEF\xBF\xBF"); -} - -TEST(Unicode, codepoint_to_utf8_four_byte_min) { - EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x10000), "\xF0\x90\x80\x80"); -} - -TEST(Unicode, codepoint_to_utf8_four_byte_emoji) { - EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x1F600), "\xF0\x9F\x98\x80"); -} - -TEST(Unicode, codepoint_to_utf8_four_byte_max) { - EXPECT_EQ(sourcemeta::core::codepoint_to_utf8(0x10FFFF), "\xF4\x8F\xBF\xBF"); -} - 
-TEST(Unicode, codepoint_to_utf8_stream_ascii_letter) { - std::ostringstream output; - sourcemeta::core::codepoint_to_utf8(0x41, output); - EXPECT_EQ(output.str(), "A"); -} - -TEST(Unicode, codepoint_to_utf8_stream_two_byte_latin_e_acute) { - std::ostringstream output; - sourcemeta::core::codepoint_to_utf8(0xE9, output); - EXPECT_EQ(output.str(), "\xC3\xA9"); -} - -TEST(Unicode, codepoint_to_utf8_stream_three_byte_cjk) { - std::ostringstream output; - sourcemeta::core::codepoint_to_utf8(0x4E16, output); - EXPECT_EQ(output.str(), "\xE4\xB8\x96"); -} - -TEST(Unicode, codepoint_to_utf8_stream_four_byte_emoji) { - std::ostringstream output; - sourcemeta::core::codepoint_to_utf8(0x1F600, output); - EXPECT_EQ(output.str(), "\xF0\x9F\x98\x80"); -} - -TEST(Unicode, codepoint_to_utf8_stream_multiple_codepoints) { - std::ostringstream output; - sourcemeta::core::codepoint_to_utf8(0x48, output); - sourcemeta::core::codepoint_to_utf8(0xE9, output); - sourcemeta::core::codepoint_to_utf8(0x1F600, output); - EXPECT_EQ(output.str(), "H\xC3\xA9\xF0\x9F\x98\x80"); -} - -TEST(Unicode, codepoint_to_utf8_string_ascii_letter) { - std::string output; - sourcemeta::core::codepoint_to_utf8(0x41, output); - EXPECT_EQ(output, "A"); -} - -TEST(Unicode, codepoint_to_utf8_string_two_byte_latin_e_acute) { - std::string output; - sourcemeta::core::codepoint_to_utf8(0xE9, output); - EXPECT_EQ(output, "\xC3\xA9"); -} - -TEST(Unicode, codepoint_to_utf8_string_three_byte_cjk) { - std::string output; - sourcemeta::core::codepoint_to_utf8(0x4E16, output); - EXPECT_EQ(output, "\xE4\xB8\x96"); -} - -TEST(Unicode, codepoint_to_utf8_string_four_byte_emoji) { - std::string output; - sourcemeta::core::codepoint_to_utf8(0x1F600, output); - EXPECT_EQ(output, "\xF0\x9F\x98\x80"); -} - -TEST(Unicode, codepoint_to_utf8_string_multiple_codepoints) { - std::string output; - sourcemeta::core::codepoint_to_utf8(0x48, output); - sourcemeta::core::codepoint_to_utf8(0xE9, output); - 
sourcemeta::core::codepoint_to_utf8(0x1F600, output); - EXPECT_EQ(output, "H\xC3\xA9\xF0\x9F\x98\x80"); -} - -TEST(Unicode, codepoint_to_utf8_string_four_byte_max) { - std::string output; - sourcemeta::core::codepoint_to_utf8(0x10FFFF, output); - EXPECT_EQ(output, "\xF4\x8F\xBF\xBF"); -} - -TEST(Unicode, utf8_to_utf32_ascii) { - std::istringstream input{"Hello"}; - const auto result{sourcemeta::core::utf8_to_utf32(input)}; - EXPECT_TRUE(result.has_value()); - const std::u32string expected{0x48, 0x65, 0x6C, 0x6C, 0x6F}; - EXPECT_EQ(result.value(), expected); -} - -TEST(Unicode, utf8_to_utf32_empty) { - std::istringstream input{""}; - const auto result{sourcemeta::core::utf8_to_utf32(input)}; - EXPECT_TRUE(result.has_value()); - EXPECT_TRUE(result.value().empty()); -} - -TEST(Unicode, utf8_to_utf32_two_byte) { - std::istringstream input{"\xC3\xA9"}; - const auto result{sourcemeta::core::utf8_to_utf32(input)}; - EXPECT_TRUE(result.has_value()); - const std::u32string expected{0xE9}; - EXPECT_EQ(result.value(), expected); -} - -TEST(Unicode, utf8_to_utf32_three_byte_cjk) { - std::istringstream input{"\xE4\xB8\x96"}; - const auto result{sourcemeta::core::utf8_to_utf32(input)}; - EXPECT_TRUE(result.has_value()); - const std::u32string expected{0x4E16}; - EXPECT_EQ(result.value(), expected); -} - -TEST(Unicode, utf8_to_utf32_four_byte_emoji) { - std::istringstream input{"\xF0\x9F\x98\x80"}; - const auto result{sourcemeta::core::utf8_to_utf32(input)}; - EXPECT_TRUE(result.has_value()); - const std::u32string expected{0x1F600}; - EXPECT_EQ(result.value(), expected); -} - -TEST(Unicode, utf8_to_utf32_mixed) { - std::istringstream input{"H\xC3\xA9\xE4\xB8\x96\xF0\x9F\x98\x80"}; - const auto result{sourcemeta::core::utf8_to_utf32(input)}; - EXPECT_TRUE(result.has_value()); - const std::u32string expected{0x48, 0xE9, 0x4E16, 0x1F600}; - EXPECT_EQ(result.value(), expected); -} - -TEST(Unicode, utf8_to_utf32_invalid_continuation) { - std::istringstream input{"\xC3\x28"}; - const 
auto result{sourcemeta::core::utf8_to_utf32(input)}; - EXPECT_FALSE(result.has_value()); -} - -TEST(Unicode, utf8_to_utf32_truncated_sequence) { - std::istringstream input{"\xE4\xB8"}; - const auto result{sourcemeta::core::utf8_to_utf32(input)}; - EXPECT_FALSE(result.has_value()); -} - -TEST(Unicode, utf8_to_utf32_overlong_encoding) { - std::istringstream input{"\xC0\x80"}; - const auto result{sourcemeta::core::utf8_to_utf32(input)}; - EXPECT_FALSE(result.has_value()); -} - -TEST(Unicode, utf8_to_utf32_surrogate_codepoint) { - std::istringstream input{"\xED\xA0\x80"}; - const auto result{sourcemeta::core::utf8_to_utf32(input)}; - EXPECT_FALSE(result.has_value()); -} - -TEST(Unicode, utf8_to_utf32_invalid_start_byte) { - std::istringstream input{"\xFF"}; - const auto result{sourcemeta::core::utf8_to_utf32(input)}; - EXPECT_FALSE(result.has_value()); -} - -TEST(Unicode, utf8_to_utf32_string_view_ascii) { - const auto result{sourcemeta::core::utf8_to_utf32("Hello")}; - EXPECT_TRUE(result.has_value()); - const std::u32string expected{0x48, 0x65, 0x6C, 0x6C, 0x6F}; - EXPECT_EQ(result.value(), expected); -} - -TEST(Unicode, utf8_to_utf32_string_view_empty) { - const auto result{sourcemeta::core::utf8_to_utf32("")}; - EXPECT_TRUE(result.has_value()); - EXPECT_TRUE(result.value().empty()); -} - -TEST(Unicode, utf8_to_utf32_string_view_mixed) { - const auto result{ - sourcemeta::core::utf8_to_utf32("H\xC3\xA9\xE4\xB8\x96\xF0\x9F\x98\x80")}; - EXPECT_TRUE(result.has_value()); - const std::u32string expected{0x48, 0xE9, 0x4E16, 0x1F600}; - EXPECT_EQ(result.value(), expected); -} - -TEST(Unicode, utf8_to_utf32_string_view_invalid) { - const auto result{sourcemeta::core::utf8_to_utf32("\xFF")}; - EXPECT_FALSE(result.has_value()); -} diff --git a/test/unicode/utf8_codepoint_byte_count_test.cc b/test/unicode/utf8_codepoint_byte_count_test.cc new file mode 100644 index 0000000000..64e9d532f2 --- /dev/null +++ b/test/unicode/utf8_codepoint_byte_count_test.cc @@ -0,0 +1,63 @@ 
+#include + +#include + +TEST(Unicode_utf8_codepoint_byte_count, ascii_null) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0x0000), 1u); +} + +TEST(Unicode_utf8_codepoint_byte_count, ascii_letter) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0x0041), 1u); +} + +TEST(Unicode_utf8_codepoint_byte_count, ascii_high_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0x007F), 1u); +} + +TEST(Unicode_utf8_codepoint_byte_count, two_byte_low_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0x0080), 2u); +} + +TEST(Unicode_utf8_codepoint_byte_count, two_byte_latin_e_acute) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0x00E9), 2u); +} + +TEST(Unicode_utf8_codepoint_byte_count, two_byte_greek_alpha) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0x03B1), 2u); +} + +TEST(Unicode_utf8_codepoint_byte_count, two_byte_high_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0x07FF), 2u); +} + +TEST(Unicode_utf8_codepoint_byte_count, three_byte_low_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0x0800), 3u); +} + +TEST(Unicode_utf8_codepoint_byte_count, three_byte_cjk) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0x4E2D), 3u); +} + +TEST(Unicode_utf8_codepoint_byte_count, three_byte_korean_si) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0xC2E4), 3u); +} + +TEST(Unicode_utf8_codepoint_byte_count, three_byte_high_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0xFFFF), 3u); +} + +TEST(Unicode_utf8_codepoint_byte_count, four_byte_low_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0x10000), 4u); +} + +TEST(Unicode_utf8_codepoint_byte_count, four_byte_emoji_grinning) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0x1F600), 4u); +} + +TEST(Unicode_utf8_codepoint_byte_count, four_byte_smp_mid) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0x40000), 4u); +} + 
+TEST(Unicode_utf8_codepoint_byte_count, four_byte_high_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_byte_count(0x10FFFF), 4u); +} diff --git a/test/unicode/utf8_codepoint_length_test.cc b/test/unicode/utf8_codepoint_length_test.cc new file mode 100644 index 0000000000..561dd1cb91 --- /dev/null +++ b/test/unicode/utf8_codepoint_length_test.cc @@ -0,0 +1,300 @@ +#include + +#include + +#include // std::string + +TEST(Unicode_utf8_codepoint_length, empty_input_returns_zero) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, position_at_size_returns_zero) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("A", 1), 0u); +} + +TEST(Unicode_utf8_codepoint_length, position_past_size_returns_zero) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("A", 5), 0u); +} + +TEST(Unicode_utf8_codepoint_length, ascii_letter_returns_one) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("A", 0), 1u); +} + +TEST(Unicode_utf8_codepoint_length, ascii_null_returns_one) { + const std::string null_byte(1, '\0'); + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length(null_byte, 0), 1u); +} + +TEST(Unicode_utf8_codepoint_length, ascii_low_boundary_returns_one) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\x01", 0), 1u); +} + +TEST(Unicode_utf8_codepoint_length, ascii_high_boundary_returns_one) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\x7f", 0), 1u); +} + +TEST(Unicode_utf8_codepoint_length, ascii_digit_returns_one) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("0", 0), 1u); +} + +TEST(Unicode_utf8_codepoint_length, ascii_space_returns_one) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length(" ", 0), 1u); +} + +TEST(Unicode_utf8_codepoint_length, two_byte_lead_low_boundary) { + // U+0080: \xC2\x80 (smallest 2-byte codepoint) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xc2\x80", 0), 2u); +} + +TEST(Unicode_utf8_codepoint_length, two_byte_lead_high_boundary) { + // U+07FF: 
\xDF\xBF (largest 2-byte codepoint) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xdf\xbf", 0), 2u); +} + +TEST(Unicode_utf8_codepoint_length, two_byte_greek_alpha) { + // U+03B1 GREEK SMALL ALPHA: \xCE\xB1 + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xce\xb1", 0), 2u); +} + +TEST(Unicode_utf8_codepoint_length, two_byte_latin_e_acute) { + // U+00E9 LATIN SMALL E ACUTE: \xC3\xA9 + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xc3\xa9", 0), 2u); +} + +TEST(Unicode_utf8_codepoint_length, two_byte_overlong_c0_zero) { + // %xC0 is forbidden (overlong encoding of U+0000) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xc0\x80", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, two_byte_overlong_c1_max) { + // %xC1 is forbidden (overlong encoding of U+007F) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xc1\xbf", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, two_byte_invalid_tail_below_range) { + // UTF8-tail = %x80-BF; \x7F is below range + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xce\x7f", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, two_byte_invalid_tail_above_range) { + // UTF8-tail = %x80-BF; \xC0 is above range + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xce\xc0", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, two_byte_truncated) { + // Lead byte with no continuation + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xce", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_e0_low_boundary) { + // U+0800: \xE0\xA0\x80 (smallest 3-byte codepoint) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xe0\xa0\x80", 0), 3u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_e0_high_boundary) { + // U+0FFF: \xE0\xBF\xBF + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xe0\xbf\xbf", 0), 3u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_e0_overlong_low) { + // %xE0 %x80-9F is overlong (codepoints < U+0800) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xe0\x80\x80", 
0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_e0_overlong_boundary) { + // %xE0 %x9F is just below the valid %xA0 + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xe0\x9f\xbf", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_e1_letter) { + // U+1000 MYANMAR: \xE1\x80\x80 + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xe1\x80\x80", 0), 3u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_e4_cjk) { + // U+4E2D CJK 中: \xE4\xB8\xAD + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xe4\xb8\xad", 0), 3u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_ec_high_boundary) { + // %xEC range is fully open: \xEC\xBF\xBF (U+CFFF) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xec\xbf\xbf", 0), 3u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_ec_korean_si) { + // U+C2E4 실 (Hangul SI): \xEC\x8B\xA4 + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xec\x8b\xa4", 0), 3u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_ed_low_boundary) { + // U+D000: \xED\x80\x80 (just below surrogate range) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xed\x80\x80", 0), 3u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_ed_high_boundary) { + // U+D7FF: \xED\x9F\xBF (the last codepoint before the surrogate range) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xed\x9f\xbf", 0), 3u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_surrogate_low) { + // U+D800: \xED\xA0\x80 (forbidden surrogate range) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xed\xa0\x80", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_surrogate_high) { + // U+DFFF: \xED\xBF\xBF (forbidden surrogate range) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xed\xbf\xbf", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_ee_low_boundary) { + // U+E000 (first codepoint after the surrogate range): \xEE\x80\x80 + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xee\x80\x80", 0), 3u); +} + 
+TEST(Unicode_utf8_codepoint_length, three_byte_ef_high_boundary) { + // U+FFFF (largest 3-byte codepoint): \xEF\xBF\xBF + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xef\xbf\xbf", 0), 3u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_truncated_one_byte) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xe4", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_truncated_two_bytes) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xe4\xb8", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_invalid_third_byte) { + // Third byte must be %x80-BF + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xe4\xb8\x7f", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_f0_low_boundary) { + // U+10000 (smallest 4-byte codepoint): \xF0\x90\x80\x80 + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf0\x90\x80\x80", 0), 4u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_f0_high_boundary) { + // %xF0 %xBF\xBF\xBF (U+3FFFF) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf0\xbf\xbf\xbf", 0), 4u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_f0_overlong_low) { + // %xF0 %x80-8F is overlong (codepoints < U+10000) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf0\x80\x80\x80", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_f0_overlong_high_boundary) { + // %xF0 %x8F is just below the valid %x90 + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf0\x8f\xbf\xbf", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_f1_low) { + // U+40000: \xF1\x80\x80\x80 + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf1\x80\x80\x80", 0), 4u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_f3_high) { + // U+FFFFF: \xF3\xBF\xBF\xBF + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf3\xbf\xbf\xbf", 0), 4u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_emoji_grinning) { + // U+1F600 😀: \xF0\x9F\x98\x80 + 
EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf0\x9f\x98\x80", 0), 4u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_f4_low_boundary) { + // U+100000: \xF4\x80\x80\x80 + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf4\x80\x80\x80", 0), 4u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_f4_high_boundary) { + // U+10FFFF (last valid Unicode codepoint): \xF4\x8F\xBF\xBF + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf4\x8f\xbf\xbf", 0), 4u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_above_max_codepoint) { + // %xF4 %x90+ would encode codepoints > U+10FFFF (forbidden) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf4\x90\x80\x80", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_above_max_high) { + // %xF4 %xBF\xBF\xBF would encode U+13FFFF (forbidden) + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf4\xbf\xbf\xbf", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_truncated_one_byte) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf0", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_truncated_two_bytes) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf0\x9f", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_truncated_three_bytes) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf0\x9f\x98", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_invalid_third) { + // Third byte must be %x80-BF + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf0\x9f\x7f\x80", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_invalid_fourth) { + // Fourth byte must be %x80-BF + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf0\x9f\x98\xc0", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_f5_forbidden_lead) { + // %xF5-FF are not valid lead bytes per RFC 6532 §3.1 + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xf5\x80\x80\x80", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, lead_byte_fe_forbidden) { + // 
%xFE is not a valid lead byte + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xfe\x80\x80\x80", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, lead_byte_ff_forbidden) { + // %xFF is not a valid lead byte + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xff", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, lone_continuation_low) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\x80", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, lone_continuation_high) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xbf", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, lone_continuation_middle) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("\xa0", 0), 0u); +} + +TEST(Unicode_utf8_codepoint_length, ascii_at_offset) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("ABC", 1), 1u); +} + +TEST(Unicode_utf8_codepoint_length, two_byte_at_offset) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("xy\xce\xb1z", 2), 2u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_at_offset) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("xy\xe4\xb8\xad", 2), 3u); +} + +TEST(Unicode_utf8_codepoint_length, four_byte_at_offset) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("xy\xf0\x9f\x98\x80", 2), + 4u); +} + +TEST(Unicode_utf8_codepoint_length, two_byte_truncated_at_offset) { + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("xy\xce", 2), 0u); +} + +TEST(Unicode_utf8_codepoint_length, three_byte_truncated_at_offset) { + // Only one byte after the lead at position 2 + EXPECT_EQ(sourcemeta::core::utf8_codepoint_length("xy\xe4\xb8", 2), 0u); +} diff --git a/test/unicode/utf8_lead_byte_size_test.cc b/test/unicode/utf8_lead_byte_size_test.cc new file mode 100644 index 0000000000..d7b3b7a3d5 --- /dev/null +++ b/test/unicode/utf8_lead_byte_size_test.cc @@ -0,0 +1,87 @@ +#include + +#include + +TEST(Unicode_utf8_lead_byte_size, ascii_null) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0x00), 1u); +} + 
+TEST(Unicode_utf8_lead_byte_size, ascii_letter) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0x41), 1u); +} + +TEST(Unicode_utf8_lead_byte_size, ascii_high_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0x7F), 1u); +} + +TEST(Unicode_utf8_lead_byte_size, continuation_low_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0x80), 0u); +} + +TEST(Unicode_utf8_lead_byte_size, continuation_mid) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xA0), 0u); +} + +TEST(Unicode_utf8_lead_byte_size, continuation_high_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xBF), 0u); +} + +TEST(Unicode_utf8_lead_byte_size, overlong_c0) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xC0), 0u); +} + +TEST(Unicode_utf8_lead_byte_size, overlong_c1) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xC1), 0u); +} + +TEST(Unicode_utf8_lead_byte_size, two_byte_low_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xC2), 2u); +} + +TEST(Unicode_utf8_lead_byte_size, two_byte_mid) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xCE), 2u); +} + +TEST(Unicode_utf8_lead_byte_size, two_byte_high_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xDF), 2u); +} + +TEST(Unicode_utf8_lead_byte_size, three_byte_low_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xE0), 3u); +} + +TEST(Unicode_utf8_lead_byte_size, three_byte_mid_e4) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xE4), 3u); +} + +TEST(Unicode_utf8_lead_byte_size, three_byte_ed) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xED), 3u); +} + +TEST(Unicode_utf8_lead_byte_size, three_byte_high_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xEF), 3u); +} + +TEST(Unicode_utf8_lead_byte_size, four_byte_low_boundary) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xF0), 4u); +} + +TEST(Unicode_utf8_lead_byte_size, four_byte_mid_f2) { + EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xF2), 4u); +} + 
+TEST(Unicode_utf8_lead_byte_size, four_byte_high_boundary) {
+  EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xF4), 4u); // last 4-byte
+}
+
+TEST(Unicode_utf8_lead_byte_size, above_range_f5) {
+  EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xF5), 0u); // above range
+}
+
+TEST(Unicode_utf8_lead_byte_size, above_range_fe) {
+  EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xFE), 0u); // above range
+}
+
+TEST(Unicode_utf8_lead_byte_size, above_range_ff) {
+  EXPECT_EQ(sourcemeta::core::utf8_lead_byte_size(0xFF), 0u); // above range
+}
diff --git a/test/unicode/utf8_to_utf32_test.cc b/test/unicode/utf8_to_utf32_test.cc
new file mode 100644
index 0000000000..907c6d012c
--- /dev/null
+++ b/test/unicode/utf8_to_utf32_test.cc
@@ -0,0 +1,109 @@
+#include <gtest/gtest.h>
+
+#include <sourcemeta/core/unicode.h>
+
+#include <sstream> // std::istringstream
+#include <string>  // std::u32string
+
+TEST(Unicode_utf8_to_utf32, ascii) {
+  std::istringstream input{"Hello"};
+  const auto result{sourcemeta::core::utf8_to_utf32(input)};
+  ASSERT_TRUE(result.has_value()); // fatal: value() is read next
+  const std::u32string expected{0x48, 0x65, 0x6C, 0x6C, 0x6F};
+  EXPECT_EQ(result.value(), expected);
+}
+
+TEST(Unicode_utf8_to_utf32, empty) {
+  std::istringstream input{""};
+  const auto result{sourcemeta::core::utf8_to_utf32(input)};
+  ASSERT_TRUE(result.has_value()); // empty input is a success, not an error
+  EXPECT_TRUE(result.value().empty());
+}
+
+TEST(Unicode_utf8_to_utf32, two_byte) {
+  std::istringstream input{"\xC3\xA9"}; // U+00E9
+  const auto result{sourcemeta::core::utf8_to_utf32(input)};
+  ASSERT_TRUE(result.has_value());
+  const std::u32string expected{0xE9};
+  EXPECT_EQ(result.value(), expected);
+}
+
+TEST(Unicode_utf8_to_utf32, three_byte_cjk) {
+  std::istringstream input{"\xE4\xB8\x96"}; // U+4E16
+  const auto result{sourcemeta::core::utf8_to_utf32(input)};
+  ASSERT_TRUE(result.has_value());
+  const std::u32string expected{0x4E16};
+  EXPECT_EQ(result.value(), expected);
+}
+
+TEST(Unicode_utf8_to_utf32, four_byte_emoji) {
+  std::istringstream input{"\xF0\x9F\x98\x80"}; // U+1F600
+  const auto result{sourcemeta::core::utf8_to_utf32(input)};
+  ASSERT_TRUE(result.has_value());
+  const std::u32string expected{0x1F600};
+  EXPECT_EQ(result.value(), expected);
+}
+
+TEST(Unicode_utf8_to_utf32, mixed) {
+  std::istringstream input{"H\xC3\xA9\xE4\xB8\x96\xF0\x9F\x98\x80"};
+  const auto result{sourcemeta::core::utf8_to_utf32(input)};
+  ASSERT_TRUE(result.has_value());
+  const std::u32string expected{0x48, 0xE9, 0x4E16, 0x1F600}; // 1+2+3+4 bytes
+  EXPECT_EQ(result.value(), expected);
+}
+
+TEST(Unicode_utf8_to_utf32, invalid_continuation) {
+  std::istringstream input{"\xC3\x28"}; // 0x28 is not a 10xxxxxx byte
+  const auto result{sourcemeta::core::utf8_to_utf32(input)};
+  EXPECT_FALSE(result.has_value());
+}
+
+TEST(Unicode_utf8_to_utf32, truncated_sequence) {
+  std::istringstream input{"\xE4\xB8"}; // 3-byte lead, only 1 continuation
+  const auto result{sourcemeta::core::utf8_to_utf32(input)};
+  EXPECT_FALSE(result.has_value());
+}
+
+TEST(Unicode_utf8_to_utf32, overlong_encoding) {
+  std::istringstream input{"\xC0\x80"}; // overlong form of U+0000
+  const auto result{sourcemeta::core::utf8_to_utf32(input)};
+  EXPECT_FALSE(result.has_value());
+}
+
+TEST(Unicode_utf8_to_utf32, surrogate_codepoint) {
+  std::istringstream input{"\xED\xA0\x80"}; // would decode to U+D800
+  const auto result{sourcemeta::core::utf8_to_utf32(input)};
+  EXPECT_FALSE(result.has_value());
+}
+
+TEST(Unicode_utf8_to_utf32, invalid_start_byte) {
+  std::istringstream input{"\xFF"}; // 0xFF cannot start a sequence
+  const auto result{sourcemeta::core::utf8_to_utf32(input)};
+  EXPECT_FALSE(result.has_value());
+}
+
+TEST(Unicode_utf8_to_utf32, string_view_ascii) {
+  const auto result{sourcemeta::core::utf8_to_utf32("Hello")};
+  ASSERT_TRUE(result.has_value());
+  const std::u32string expected{0x48, 0x65, 0x6C, 0x6C, 0x6F};
+  EXPECT_EQ(result.value(), expected);
+}
+
+TEST(Unicode_utf8_to_utf32, string_view_empty) {
+  const auto result{sourcemeta::core::utf8_to_utf32("")};
+  ASSERT_TRUE(result.has_value());
+  EXPECT_TRUE(result.value().empty());
+}
+
+TEST(Unicode_utf8_to_utf32, string_view_mixed) {
+  const auto result{
+      sourcemeta::core::utf8_to_utf32("H\xC3\xA9\xE4\xB8\x96\xF0\x9F\x98\x80")};
+  ASSERT_TRUE(result.has_value());
+  const std::u32string expected{0x48, 0xE9, 0x4E16, 0x1F600}; // 1+2+3+4 bytes
+  EXPECT_EQ(result.value(), expected);
+}
+
+TEST(Unicode_utf8_to_utf32, string_view_invalid) {
+  const auto result{sourcemeta::core::utf8_to_utf32("\xFF")};
+  EXPECT_FALSE(result.has_value());
+}