diff --git a/config.cmake.in b/config.cmake.in index 4fbda765e..5245c135e 100644 --- a/config.cmake.in +++ b/config.cmake.in @@ -69,6 +69,7 @@ foreach(component ${SOURCEMETA_CORE_COMPONENTS}) elseif(component STREQUAL "ip") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_ip.cmake") elseif(component STREQUAL "dns") + include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_unicode.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_dns.cmake") elseif(component STREQUAL "email") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_ip.cmake") diff --git a/src/core/dns/CMakeLists.txt b/src/core/dns/CMakeLists.txt index 3c61a4208..b29923201 100644 --- a/src/core/dns/CMakeLists.txt +++ b/src/core/dns/CMakeLists.txt @@ -4,3 +4,6 @@ sourcemeta_library(NAMESPACE sourcemeta PROJECT core NAME dns if(SOURCEMETA_CORE_INSTALL) sourcemeta_library_install(NAMESPACE sourcemeta PROJECT core NAME dns) endif() + +target_link_libraries(sourcemeta_core_dns + PRIVATE sourcemeta::core::unicode) diff --git a/src/core/dns/hostname.cc b/src/core/dns/hostname.cc index 4692b85fc..6864fd736 100644 --- a/src/core/dns/hostname.cc +++ b/src/core/dns/hostname.cc @@ -1,5 +1,7 @@ #include +#include + namespace sourcemeta::core { // RFC 952 §B: let-dig = ALPHA / DIGIT @@ -10,12 +12,10 @@ static constexpr auto is_let_dig(const char character) -> bool { (character >= '0' && character <= '9'); } -// RFC 952 §B: let-dig-hyp = ALPHA / DIGIT / "-" -static constexpr auto is_let_dig_hyp(const char character) -> bool { - return is_let_dig(character) || character == '-'; -} - -auto is_hostname(const std::string_view value) -> bool { +// RFC 1123 §2.1: hostname grammar. When AllowUtf8 is true, RFC 5890 §2.3.2.3 +// extends each label with UTF-8 non-ASCII bytes (RFC 6532 §3.1) +template +static auto is_hostname_impl(const std::string_view value) -> bool { // RFC 952 §B: requires at least one if (value.empty()) { return false; @@ -27,42 +27,57 @@ auto is_hostname(const std::string_view value) -> bool { } std::string_view::size_type position{0}; - while (position < value.size()) { const auto label_start{position}; - - // RFC 1123 §2.1: first character is letter or digit - if (!is_let_dig(value[position])) { - return false; - } - position += 1; + bool last_was_hyphen{false}; + bool label_has_content{false}; while (position < value.size() && value[position] != '.') { - // RFC 952 §B: interior characters are let-dig-hyp - if (!is_let_dig_hyp(value[position])) { - return false; + const auto character{value[position]}; + if (character == '-') { + // RFC 1123 §2.1: first character must be let-dig, never hyphen + if (!label_has_content) { + return false; + } + last_was_hyphen = true; + position += 1; + label_has_content = true; + continue; } - position += 1; - } - const auto label_length{position - label_start}; + if (is_let_dig(character)) { + last_was_hyphen = false; + position += 1; + label_has_content = true; + continue; + } - // RFC 1123 §2.1: MUST handle host names of up to 63 characters (per label) - if (label_length > 63) { - return false; + if constexpr (AllowUtf8) { + // RFC 5890 §2.3.2.3 / RFC 6532 §3.1: UTF-8 non-ASCII codepoint as a + // U-label byte + const auto utf8_length{utf8_codepoint_length(value, position)}; + if (utf8_length < 2) { + return false; + } + last_was_hyphen = false; + position += utf8_length; + label_has_content = true; + } else { + return false; + } } - // RFC 952 §B + ASSUMPTIONS: last character must not be a minus sign - if (value[position - 1] == '-') { + // RFC 1035 §2.3.4: per-label cap is 63 octets + const auto label_length{position - label_start}; + if (label_length == 0 || label_length > 63 || last_was_hyphen) { return false; } - // If we stopped on a dot, there must be another label following it if (position < value.size()) { // value[position] == '.' position += 1; - // Trailing dot: JSON Schema test suite requires rejection (TS d7+ #15) - if (position >= value.size()) { + if (position == value.size()) { + // Trailing dot is not part of the host name grammar return false; } } @@ -71,4 +86,12 @@ auto is_hostname(const std::string_view value) -> bool { return true; } +auto is_hostname(const std::string_view value) -> bool { + return is_hostname_impl(value); +} + +auto is_idn_hostname(const std::string_view value) -> bool { + return is_hostname_impl(value); +} + } // namespace sourcemeta::core diff --git a/src/core/dns/include/sourcemeta/core/dns.h b/src/core/dns/include/sourcemeta/core/dns.h index ff827b0cb..b464bb04c 100644 --- a/src/core/dns/include/sourcemeta/core/dns.h +++ b/src/core/dns/include/sourcemeta/core/dns.h @@ -21,8 +21,7 @@ namespace sourcemeta::core { /// @ingroup dns /// Check whether the given string is a valid Internet host name per /// RFC 1123 Section 2.1, which relaxes the first-character rule of -/// RFC 952 to allow either a letter or a digit. This matches the -/// definition used by the JSON Schema `hostname` format. For example: +/// RFC 952 to allow either a letter or a digit. For example: /// /// ```cpp /// #include @@ -36,11 +35,36 @@ namespace sourcemeta::core { /// ``` /// /// This function implements RFC 1123 §2.1 (ASCII only). It does not -/// perform A-label or Punycode decoding. Those belong to the separate -/// `idn-hostname` format. +/// perform A-label or Punycode decoding. For internationalized host +/// names see `is_idn_hostname`. SOURCEMETA_CORE_DNS_EXPORT auto is_hostname(const std::string_view value) -> bool; +/// @ingroup dns +/// Check whether the given string is a valid internationalized host name. +/// Accepts every input that `is_hostname` accepts, and additionally allows +/// each label to contain valid UTF-8 non-ASCII byte sequences (RFC 6532 +/// Section 3.1), modelling the U-label extension of RFC 5890 Section +/// 2.3.2.3. For example: +/// +/// ```cpp +/// #include +/// +/// #include +/// +/// assert(sourcemeta::core::is_idn_hostname("www.example.com")); +/// assert(sourcemeta::core::is_idn_hostname( +/// "\xec\x8b\xa4\xeb\xa1\x80.\xed\x85\x8c\xec\x8a\xa4\xed\x8a\xb8")); +/// assert(!sourcemeta::core::is_idn_hostname("-bad")); +/// ``` +/// +/// This is a best-effort lexical check: it accepts the byte-level structure +/// of an RFC 5890 U-label but does not perform full IDNA2008 validation +/// (no NFC normalization, no Bidi rule, no ContextJ/O checks, no Punycode +/// round-trip). +SOURCEMETA_CORE_DNS_EXPORT +auto is_idn_hostname(const std::string_view value) -> bool; + } // namespace sourcemeta::core #endif diff --git a/src/core/email/email.cc b/src/core/email/email.cc index c3eb899cf..51a578ff3 100644 --- a/src/core/email/email.cc +++ b/src/core/email/email.cc @@ -1,6 +1,7 @@ #include #include +#include #include "helpers.h" @@ -115,7 +116,7 @@ static auto is_mailbox(const std::string_view value) -> bool { if constexpr (AllowUtf8) { // RFC 6531 §3.3: sub-domain =/ U-label - return is_idn_domain(domain); + return is_idn_hostname(domain); } else { // RFC 5321 §4.1.2 Domain matches is_hostname (RFC 1123 §2.1) by // grammar, by 63-octet label cap (RFC 1035 §2.3.4), and by diff --git a/src/core/email/helpers.h b/src/core/email/helpers.h index 5cc9b822b..1b900eebf 100644 --- a/src/core/email/helpers.h +++ b/src/core/email/helpers.h @@ -2,7 +2,6 @@ #define SOURCEMETA_CORE_EMAIL_HELPERS_H_ #include -#include #include // std::string_view @@ -132,69 +131,6 @@ inline auto is_address_literal(const std::string_view domain) -> bool { return is_general_address_literal(inner); } -// TODO: Move to src/core/dns - -// RFC 6531 §3.3: sub-domain =/ U-label -// Relaxed sub-domain grammar where each label is a non-empty sequence of -// LetDig / hyphen / UTF8-non-ascii bytes, with no leading or trailing hyphen, -// length limits per RFC 5321 §4.5.3.1.2 and RFC 1035 §2.3.4 -inline auto is_idn_domain(const std::string_view value) -> bool { - if (value.empty() || value.size() > 255) { - return false; - } - - std::string_view::size_type position{0}; - while (position < value.size()) { - const auto label_start{position}; - bool last_was_hyphen{false}; - bool label_has_content{false}; - - while (position < value.size() && value[position] != '.') { - const auto character{value[position]}; - if (character == '-') { - if (!label_has_content) { - return false; - } - last_was_hyphen = true; - position += 1; - label_has_content = true; - continue; - } - - if (is_let_dig(character)) { - last_was_hyphen = false; - position += 1; - label_has_content = true; - continue; - } - - const auto utf8_length{ - sourcemeta::core::utf8_codepoint_length(value, position)}; - if (utf8_length < 2) { - return false; - } - last_was_hyphen = false; - position += utf8_length; - label_has_content = true; - } - - const auto label_length{position - label_start}; - if (label_length == 0 || label_length > 63 || last_was_hyphen) { - return false; - } - - if (position < value.size()) { - position += 1; - if (position == value.size()) { - // RFC 5321 §4.1.2 Domain has no trailing dot - return false; - } - } - } - - return true; -} - } // namespace #endif diff --git a/test/dns/CMakeLists.txt b/test/dns/CMakeLists.txt index e15d808fc..f43ed9554 100644 --- a/test/dns/CMakeLists.txt +++ b/test/dns/CMakeLists.txt @@ -1,5 +1,5 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME dns - SOURCES hostname_test.cc) + SOURCES hostname_test.cc idn_hostname_test.cc) target_link_libraries(sourcemeta_core_dns_unit PRIVATE sourcemeta::core::dns) diff --git a/test/dns/hostname_test.cc b/test/dns/hostname_test.cc index 66ad19586..f92b9b476 100644 --- a/test/dns/hostname_test.cc +++ b/test/dns/hostname_test.cc @@ -7,66 +7,79 @@ // RFC 952 §B: ::= *["."] (three labels, TS d7+ #7) TEST(DNS_hostname, valid_simple_dotted) { EXPECT_TRUE(sourcemeta::core::is_hostname("www.example.com")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("www.example.com")); } // RFC 952 §B: allows a single (TS d7+ #8) TEST(DNS_hostname, valid_single_label) { EXPECT_TRUE(sourcemeta::core::is_hostname("hostname")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("hostname")); } // RFC 952 §B: interior digits are let-dig-hyp (TS d7+ #9) TEST(DNS_hostname, valid_single_label_with_digits) { EXPECT_TRUE(sourcemeta::core::is_hostname("h0stn4me")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("h0stn4me")); } // RFC 1123 §2.1: first character may be a digit (TS d7+ #10) TEST(DNS_hostname, valid_starts_with_digit) { EXPECT_TRUE(sourcemeta::core::is_hostname("1host")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("1host")); } // RFC 952 §B: ends with (TS d7+ #11) TEST(DNS_hostname, valid_ends_with_digit) { EXPECT_TRUE(sourcemeta::core::is_hostname("hostnam3")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("hostnam3")); } // RFC 952 §B: interior '-' is let-dig-hyp (TS d7+ #17) TEST(DNS_hostname, valid_interior_hyphen) { EXPECT_TRUE(sourcemeta::core::is_hostname("host-name")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("host-name")); } // RFC 952 §B: two minimal single-character labels TEST(DNS_hostname, valid_two_labels) { EXPECT_TRUE(sourcemeta::core::is_hostname("a.b")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("a.b")); } // RFC 952 §B: single is a valid TEST(DNS_hostname, valid_single_letter) { EXPECT_TRUE(sourcemeta::core::is_hostname("a")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("a")); } // RFC 1123 §2.1: single digit is a valid label TEST(DNS_hostname, valid_single_digit) { EXPECT_TRUE(sourcemeta::core::is_hostname("0")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("0")); } // RFC 952 ASSUMPTIONS: no distinction between upper and lower case TEST(DNS_hostname, valid_single_uppercase) { EXPECT_TRUE(sourcemeta::core::is_hostname("A")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("A")); } // RFC 952 ASSUMPTIONS: case-insensitive — mixed case is valid TEST(DNS_hostname, valid_mixed_case) { EXPECT_TRUE(sourcemeta::core::is_hostname("HosT.CoM")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("HosT.CoM")); } // RFC 1123 §2.1 MUST: label of exactly 63 chars (TS d4 #17, TS d7+ #23) TEST(DNS_hostname, valid_label_exactly_63) { EXPECT_TRUE(sourcemeta::core::is_hostname(std::string(63, 'a') + ".com")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname(std::string(63, 'a') + ".com")); } // RFC 1123 §2.1 MUST: single label of exactly 63 chars TEST(DNS_hostname, valid_single_label_63) { EXPECT_TRUE(sourcemeta::core::is_hostname(std::string(63, 'a'))); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname(std::string(63, 'a'))); } // RFC 1123 §2.1 SHOULD: 253-byte total is under the 255-char cap @@ -75,6 +88,9 @@ TEST(DNS_hostname, valid_total_253) { EXPECT_TRUE(sourcemeta::core::is_hostname( std::string(63, 'a') + "." + std::string(63, 'a') + "." + std::string(63, 'a') + "." + std::string(61, 'a'))); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname( + std::string(63, 'a') + "." + std::string(63, 'a') + "." + + std::string(63, 'a') + "." + std::string(61, 'a'))); } // RFC 1035 §2.3.4 via RFC 1123 §2.1: 254-byte hostname is under the 255 cap @@ -83,6 +99,9 @@ TEST(DNS_hostname, valid_total_254) { EXPECT_TRUE(sourcemeta::core::is_hostname( std::string(63, 'a') + "." + std::string(63, 'a') + "." + std::string(63, 'a') + "." + std::string(62, 'a'))); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname( + std::string(63, 'a') + "." + std::string(63, 'a') + "." + + std::string(63, 'a') + "." + std::string(62, 'a'))); } // RFC 1035 §2.3.4 via RFC 1123 §2.1: exactly at the 255-char SHOULD limit @@ -91,118 +110,144 @@ TEST(DNS_hostname, valid_total_255) { EXPECT_TRUE(sourcemeta::core::is_hostname( std::string(63, 'a') + "." + std::string(63, 'a') + "." + std::string(63, 'a') + "." + std::string(63, 'a'))); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname( + std::string(63, 'a') + "." + std::string(63, 'a') + "." + + std::string(63, 'a') + "." + std::string(63, 'a'))); } // RFC 952 §B: xn-- labels are plain ASCII; grammar accepts (TS d4 #8) TEST(DNS_hostname, valid_punycoded_draft4) { EXPECT_TRUE(sourcemeta::core::is_hostname("xn--4gbwdl.xn--wgbh1c")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("xn--4gbwdl.xn--wgbh1c")); } // RFC 1123 §2.1: no positions-3-4 rule exists; we accept (documented TS d7+ // #20 divergence — test cites RFC 5891 §4.2.3.1, which is IDNA2008 only) TEST(DNS_hostname, valid_xn_positions_34_both_hyphen) { EXPECT_TRUE(sourcemeta::core::is_hostname("XN--aa---o47jg78q")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("XN--aa---o47jg78q")); } // RFC 952 §B: grammar has no rule against consecutive interior hyphens TEST(DNS_hostname, valid_consecutive_interior_hyphens) { EXPECT_TRUE(sourcemeta::core::is_hostname("a--b")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("a--b")); } // RFC 1123 §2.1 DISCUSSION: numeric TLD is not forbidden by grammar TEST(DNS_hostname, valid_numeric_tld) { EXPECT_TRUE(sourcemeta::core::is_hostname("example.123")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("example.123")); } // RFC 952 §B: any number of labels separated by dots TEST(DNS_hostname, valid_many_labels) { EXPECT_TRUE(sourcemeta::core::is_hostname("a.b.c.d.e.f")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("a.b.c.d.e.f")); } // RFC 952 §B: requires at least one / label (TS d7+ #12) TEST(DNS_hostname, invalid_empty) { EXPECT_FALSE(sourcemeta::core::is_hostname("")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("")); } // RFC 952 §B: bare '.' has no before or after (TS d7+ #13) TEST(DNS_hostname, invalid_single_dot) { EXPECT_FALSE(sourcemeta::core::is_hostname(".")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname(".")); } // RFC 952 §B: leading '.' yields an empty label (TS d7+ #14) TEST(DNS_hostname, invalid_leading_dot) { EXPECT_FALSE(sourcemeta::core::is_hostname(".example")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname(".example")); } // JSON Schema test suite: trailing dot is invalid from draft 4 onward (TS d7+ // #15) TEST(DNS_hostname, invalid_trailing_dot) { EXPECT_FALSE(sourcemeta::core::is_hostname("example.")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("example.")); } // JSON Schema test suite: trailing dot generalised to single label TEST(DNS_hostname, invalid_trailing_dot_single) { EXPECT_FALSE(sourcemeta::core::is_hostname("host.")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("host.")); } // RFC 952 §B: double dot yields an empty label TEST(DNS_hostname, invalid_double_dot) { EXPECT_FALSE(sourcemeta::core::is_hostname("example..com")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("example..com")); } // RFC 952 §B: double trailing dot yields empty labels TEST(DNS_hostname, invalid_double_trailing_dot) { EXPECT_FALSE(sourcemeta::core::is_hostname("example..")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("example..")); } // RFC 1123 §2.1: first char of label must be letter or digit (TS d7+ #18) TEST(DNS_hostname, invalid_label_starts_with_hyphen) { EXPECT_FALSE(sourcemeta::core::is_hostname("-hostname")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("-hostname")); } // RFC 952 §B + ASSUMPTIONS: last char must not be minus sign (TS d7+ #19) TEST(DNS_hostname, invalid_label_ends_with_hyphen) { EXPECT_FALSE(sourcemeta::core::is_hostname("hostname-")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("hostname-")); } // RFC 1123 §2.1: first-char rule applies to every label, not just the first TEST(DNS_hostname, invalid_middle_label_starts_with_hyphen) { EXPECT_FALSE(sourcemeta::core::is_hostname("a.-b.c")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("a.-b.c")); } // RFC 952 §B: final-char rule applies to every label, not just the last TEST(DNS_hostname, invalid_middle_label_ends_with_hyphen) { EXPECT_FALSE(sourcemeta::core::is_hostname("a.b-.c")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("a.b-.c")); } // RFC 952 ASSUMPTIONS: underscore is not in the alphabet (TS d4 #14) TEST(DNS_hostname, invalid_underscore_start) { EXPECT_FALSE(sourcemeta::core::is_hostname("_hostname")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("_hostname")); } // RFC 952 ASSUMPTIONS: underscore is not in the alphabet (TS d4 #15) TEST(DNS_hostname, invalid_underscore_end) { EXPECT_FALSE(sourcemeta::core::is_hostname("hostname_")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("hostname_")); } // RFC 952 ASSUMPTIONS: underscore is not in the alphabet (TS d4 #16, d7+ #21) TEST(DNS_hostname, invalid_underscore_middle) { EXPECT_FALSE(sourcemeta::core::is_hostname("host_name")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("host_name")); } // RFC 952 ASSUMPTIONS: no blank or space characters are permitted TEST(DNS_hostname, invalid_space) { EXPECT_FALSE(sourcemeta::core::is_hostname("host name")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("host name")); } // RFC 1123 §2.1 MUST: label exceeds 63-character limit (TS d4 #18, d7+ #24) TEST(DNS_hostname, invalid_label_64) { EXPECT_FALSE(sourcemeta::core::is_hostname(std::string(64, 'a') + ".com")); + EXPECT_FALSE( + sourcemeta::core::is_idn_hostname(std::string(64, 'a') + ".com")); } // RFC 1123 §2.1 MUST: single label of 64 chars exceeds per-label limit TEST(DNS_hostname, invalid_single_label_64) { EXPECT_FALSE(sourcemeta::core::is_hostname(std::string(64, 'a'))); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname(std::string(64, 'a'))); } // RFC 1035 §2.3.4 via RFC 1123 §2.1: total length exceeds 255 (constructed) @@ -212,6 +257,9 @@ TEST(DNS_hostname, invalid_total_256) { EXPECT_FALSE(sourcemeta::core::is_hostname( std::string(63, 'a') + "." + std::string(63, 'a') + "." + std::string(63, 'a') + "." + std::string(62, 'a') + ".a")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname( + std::string(63, 'a') + "." + std::string(63, 'a') + "." + + std::string(63, 'a') + "." + std::string(62, 'a') + ".a")); } // RFC 1035 §2.3.4 via RFC 1123 §2.1: exact TS d7+ #22 input (259 bytes, @@ -223,13 +271,25 @@ TEST(DNS_hostname, invalid_ts_256_string) { "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk." "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk." "com")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname( + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk." + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk." + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk." + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk." + "com")); } // RFC 952 ASSUMPTIONS: U+FF0E (fullwidth full stop) is not in the ASCII // alphabet; UTF-8 bytes 0xEF 0xBC 0x8E (TS d4 #27, d7+ #16) TEST(DNS_hostname, invalid_fullwidth_dot) { + // RFC 1123 §2.1: U+FF0E (FULLWIDTH FULL STOP) is not the ASCII label + // separator. is_idn_hostname accepts it as a normal UTF-8 codepoint within + // a label per RFC 5890 §2.3.2.3 (best-effort lexical handling); strict + // IDNA2008 would map and reject, but that is out of scope here EXPECT_FALSE(sourcemeta::core::is_hostname("example\xef\xbc\x8e" "com")); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("example\xef\xbc\x8e" + "com")); } // RFC 952 ASSUMPTIONS: any byte >= 0x80 is outside the ASCII alphabet @@ -237,6 +297,9 @@ TEST(DNS_hostname, invalid_high_bit_byte) { EXPECT_FALSE(sourcemeta::core::is_hostname(std::string_view{"a\x80" "b", 3})); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname(std::string_view{"a\x80" + "b", + 3})); } // RFC 952 ASSUMPTIONS: NUL byte (0x00) is not in the ASCII alphabet @@ -244,9 +307,13 @@ TEST(DNS_hostname, invalid_nul_byte) { EXPECT_FALSE(sourcemeta::core::is_hostname(std::string_view{"a\x00" "b", 3})); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname(std::string_view{"a\x00" + "b", + 3})); } // RFC 952 ASSUMPTIONS: '@' is not in the alphabet (A-Z 0-9 '-' '.') TEST(DNS_hostname, invalid_at_sign) { EXPECT_FALSE(sourcemeta::core::is_hostname("user@host")); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("user@host")); } diff --git a/test/dns/idn_hostname_test.cc b/test/dns/idn_hostname_test.cc new file mode 100644 index 000000000..e374dffce --- /dev/null +++ b/test/dns/idn_hostname_test.cc @@ -0,0 +1,243 @@ +#include + +#include + +#include + +// example.test rendered in Hangul (RFC 5890 §2.3.2.3) +// Bytes: 실=EC8BA4 례=EBA180 .=2E 테=ED858C 스=EC8AA4 트=ED8AB8 +TEST(DNS_idn_hostname, valid_hangul_example_test) { + EXPECT_TRUE(sourcemeta::core::is_idn_hostname( + "\xec\x8b\xa4\xeb\xa1\x80" + ".\xed\x85\x8c\xec\x8a\xa4\xed\x8a\xb8")); + EXPECT_FALSE( + sourcemeta::core::is_hostname("\xec\x8b\xa4\xeb\xa1\x80" + ".\xed\x85\x8c\xec\x8a\xa4\xed\x8a\xb8")); +} + +// RFC 1123 §2.1: ASCII single label is a subset of the extended grammar +TEST(DNS_idn_hostname, valid_ascii_single_label) { + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("hostname")); + EXPECT_TRUE(sourcemeta::core::is_hostname("hostname")); +} + +TEST(DNS_idn_hostname, valid_ascii_dotted) { + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("www.example.com")); + EXPECT_TRUE(sourcemeta::core::is_hostname("www.example.com")); +} + +TEST(DNS_idn_hostname, valid_ascii_with_hyphen) { + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("host-name")); + EXPECT_TRUE(sourcemeta::core::is_hostname("host-name")); +} + +TEST(DNS_idn_hostname, valid_ascii_with_digits) { + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("h0stn4me")); + EXPECT_TRUE(sourcemeta::core::is_hostname("h0stn4me")); +} + +TEST(DNS_idn_hostname, valid_ascii_leading_digit) { + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("1host")); + EXPECT_TRUE(sourcemeta::core::is_hostname("1host")); +} + +TEST(DNS_idn_hostname, valid_ascii_trailing_digit) { + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("hostnam3")); + EXPECT_TRUE(sourcemeta::core::is_hostname("hostnam3")); +} + +// RFC 5890 §2.3.2.1: A-label form xn--... (purely ASCII LDH) +TEST(DNS_idn_hostname, valid_ascii_a_label_form) { + EXPECT_TRUE( + sourcemeta::core::is_idn_hostname("xn--ihqwcrb4cv8a8dqg056pqjye")); + EXPECT_TRUE(sourcemeta::core::is_hostname("xn--ihqwcrb4cv8a8dqg056pqjye")); +} + +// RFC 5890 §2.3.2.3 / RFC 6532 §3.1: U-label (2-byte UTF-8: U+03B1) +TEST(DNS_idn_hostname, valid_label_two_byte_utf8) { + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("\xce\xb1")); + EXPECT_FALSE(sourcemeta::core::is_hostname("\xce\xb1")); +} + +// RFC 5890 §2.3.2.3 / RFC 6532 §3.1: U-label (3-byte UTF-8: U+4E2D 中) +TEST(DNS_idn_hostname, valid_label_three_byte_utf8) { + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("\xe4\xb8\xad")); + EXPECT_FALSE(sourcemeta::core::is_hostname("\xe4\xb8\xad")); +} + +// RFC 5890 §2.3.2.3 / RFC 6532 §3.1: U-label (4-byte UTF-8: U+1F600) +TEST(DNS_idn_hostname, valid_label_four_byte_utf8) { + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("\xf0\x9f\x98\x80")); + EXPECT_FALSE(sourcemeta::core::is_hostname("\xf0\x9f\x98\x80")); +} + +TEST(DNS_idn_hostname, valid_mixed_ascii_and_utf8_labels) { + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("example.\xce\xb1.com")); + EXPECT_FALSE(sourcemeta::core::is_hostname("example.\xce\xb1.com")); +} + +TEST(DNS_idn_hostname, valid_utf8_label_with_internal_hyphen) { + EXPECT_TRUE(sourcemeta::core::is_idn_hostname("\xce\xb1-\xe4\xb8\xad")); + EXPECT_FALSE(sourcemeta::core::is_hostname("\xce\xb1-\xe4\xb8\xad")); +} + +TEST(DNS_idn_hostname, valid_utf8_only_dotted) { + EXPECT_TRUE(sourcemeta::core::is_idn_hostname( + "\xce\xb1.\xe4\xb8\xad.\xf0\x9f\x98\x80")); + EXPECT_FALSE( + sourcemeta::core::is_hostname("\xce\xb1.\xe4\xb8\xad.\xf0\x9f\x98\x80")); +} + +// RFC 1035 §2.3.4: label may be exactly 63 octets +TEST(DNS_idn_hostname, valid_ascii_label_at_63) { + const std::string label(63, 'a'); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname(label)); + EXPECT_TRUE(sourcemeta::core::is_hostname(label)); +} + +// RFC 5321 §4.5.3.1.2 / RFC 1035 §2.3.4: total length may be exactly 255 +TEST(DNS_idn_hostname, valid_total_at_255) { + // 4 × 63 + 3 dots = 255 + std::string hostname; + for (int index = 0; index < 3; ++index) { + hostname.append(63, 'a'); + hostname.push_back('.'); + } + hostname.append(63, 'a'); + EXPECT_EQ(hostname.size(), 255u); + EXPECT_TRUE(sourcemeta::core::is_idn_hostname(hostname)); + EXPECT_TRUE(sourcemeta::core::is_hostname(hostname)); +} + +TEST(DNS_idn_hostname, invalid_empty) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("")); + EXPECT_FALSE(sourcemeta::core::is_hostname("")); +} + +// RFC 1123 §2.1: a label may not begin with a hyphen +TEST(DNS_idn_hostname, invalid_leading_hyphen) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("-hello")); + EXPECT_FALSE(sourcemeta::core::is_hostname("-hello")); +} + +// RFC 952 §B: a label may not end with a hyphen +TEST(DNS_idn_hostname, invalid_trailing_hyphen) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("hello-")); + EXPECT_FALSE(sourcemeta::core::is_hostname("hello-")); +} + +TEST(DNS_idn_hostname, invalid_leading_and_trailing_hyphen) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("-hello-")); + EXPECT_FALSE(sourcemeta::core::is_hostname("-hello-")); +} + +TEST(DNS_idn_hostname, invalid_utf8_label_trailing_hyphen) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("\xce\xb1-")); + EXPECT_FALSE(sourcemeta::core::is_hostname("\xce\xb1-")); +} + +TEST(DNS_idn_hostname, invalid_leading_dot) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname(".example")); + EXPECT_FALSE(sourcemeta::core::is_hostname(".example")); +} + +TEST(DNS_idn_hostname, invalid_trailing_dot) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("example.")); + EXPECT_FALSE(sourcemeta::core::is_hostname("example.")); +} + +TEST(DNS_idn_hostname, invalid_consecutive_dots) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("a..b")); + EXPECT_FALSE(sourcemeta::core::is_hostname("a..b")); +} + +// Space and "$" are neither ASCII LDH nor UTF-8 lead bytes +TEST(DNS_idn_hostname, invalid_dollar_amount_with_arrows) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("-> $1.00 <--")); + EXPECT_FALSE(sourcemeta::core::is_hostname("-> $1.00 <--")); +} + +TEST(DNS_idn_hostname, invalid_internal_space) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("a b")); + EXPECT_FALSE(sourcemeta::core::is_hostname("a b")); +} + +TEST(DNS_idn_hostname, invalid_underscore) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("a_b")); + EXPECT_FALSE(sourcemeta::core::is_hostname("a_b")); +} + +// RFC 1035 §2.3.4: 64-octet label exceeds the per-label cap +TEST(DNS_idn_hostname, invalid_ascii_label_at_64) { + const std::string label(64, 'a'); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname(label)); + EXPECT_FALSE(sourcemeta::core::is_hostname(label)); +} + +// RFC 5321 §4.5.3.1.2: 256-octet total exceeds the cap. Construction avoids +// trailing-dot and per-label confounds (see idn_email_test.cc rationale) +TEST(DNS_idn_hostname, invalid_total_at_256) { + std::string hostname; + for (int index = 0; index < 4; ++index) { + hostname.append(51, 'a'); + hostname.push_back('.'); + } + hostname.append(48, 'a'); + EXPECT_EQ(hostname.size(), 256u); + EXPECT_NE(hostname.back(), '.'); + EXPECT_FALSE(sourcemeta::core::is_idn_hostname(hostname)); + EXPECT_FALSE(sourcemeta::core::is_hostname(hostname)); +} + +// RFC 6532 §3.1: lone continuation byte (0xBF) cannot start a UTF-8 sequence +TEST(DNS_idn_hostname, invalid_lone_continuation_byte) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("\xbf")); + EXPECT_FALSE(sourcemeta::core::is_hostname("\xbf")); +} + +// RFC 6532 §3.1: 2-byte starter without a continuation byte +TEST(DNS_idn_hostname, invalid_truncated_two_byte) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("\xce")); + EXPECT_FALSE(sourcemeta::core::is_hostname("\xce")); +} + +// RFC 6532 §3.1: %xE0 %x80-9F is an overlong 3-byte encoding +TEST(DNS_idn_hostname, invalid_overlong_three_byte) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("\xe0\x80\xa0")); + EXPECT_FALSE(sourcemeta::core::is_hostname("\xe0\x80\xa0")); +} + +// RFC 6532 §3.1: U+D800 surrogate codepoint is forbidden +TEST(DNS_idn_hostname, invalid_surrogate_codepoint) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("\xed\xa0\x80")); + EXPECT_FALSE(sourcemeta::core::is_hostname("\xed\xa0\x80")); +} + +// RFC 6532 §3.1: codepoints above U+10FFFF are forbidden +TEST(DNS_idn_hostname, invalid_above_max_codepoint) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("\xf4\x90\x80\x80")); + EXPECT_FALSE(sourcemeta::core::is_hostname("\xf4\x90\x80\x80")); +} + +// RFC 6532 §3.1: 4-byte starter with truncated continuation +TEST(DNS_idn_hostname, invalid_truncated_four_byte) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("\xf0\x9f\x98")); + EXPECT_FALSE(sourcemeta::core::is_hostname("\xf0\x9f\x98")); +} + +// RFC 6532 §3.1: %xC0 is a forbidden lead byte (overlong U+0000) +TEST(DNS_idn_hostname, invalid_overlong_c0) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("\xc0\x80")); + EXPECT_FALSE(sourcemeta::core::is_hostname("\xc0\x80")); +} + +// RFC 6532 §3.1: %xF5 is not a valid lead byte +TEST(DNS_idn_hostname, invalid_lead_f5) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("\xf5\x80\x80\x80")); + EXPECT_FALSE(sourcemeta::core::is_hostname("\xf5\x80\x80\x80")); +} + +TEST(DNS_idn_hostname, invalid_invalid_utf8_in_middle_label) { + EXPECT_FALSE(sourcemeta::core::is_idn_hostname("a.\xc0\x80.b")); + EXPECT_FALSE(sourcemeta::core::is_hostname("a.\xc0\x80.b")); +}