Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ foreach(component ${SOURCEMETA_CORE_COMPONENTS})
elseif(component STREQUAL "ip")
include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_ip.cmake")
elseif(component STREQUAL "dns")
include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_unicode.cmake")
include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_dns.cmake")
elseif(component STREQUAL "email")
include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_ip.cmake")
Expand Down
3 changes: 3 additions & 0 deletions src/core/dns/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ sourcemeta_library(NAMESPACE sourcemeta PROJECT core NAME dns
if(SOURCEMETA_CORE_INSTALL)
sourcemeta_library_install(NAMESPACE sourcemeta PROJECT core NAME dns)
endif()

target_link_libraries(sourcemeta_core_dns
PRIVATE sourcemeta::core::unicode)
77 changes: 50 additions & 27 deletions src/core/dns/hostname.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include <sourcemeta/core/dns.h>

#include <sourcemeta/core/unicode.h>

namespace sourcemeta::core {

// RFC 952 §B: let-dig = ALPHA / DIGIT
Expand All @@ -10,12 +12,10 @@ static constexpr auto is_let_dig(const char character) -> bool {
(character >= '0' && character <= '9');
}

// RFC 952 §B: let-dig-hyp = ALPHA / DIGIT / "-"
// A hyphen is accepted directly; anything else is deferred to is_let_dig
static constexpr auto is_let_dig_hyp(const char character) -> bool {
  if (character == '-') {
    return true;
  }

  return is_let_dig(character);
}

auto is_hostname(const std::string_view value) -> bool {
// RFC 1123 §2.1: hostname grammar. When AllowUtf8 is true, RFC 5890 §2.3.2.3
// extends each label with UTF-8 non-ASCII bytes (RFC 6532 §3.1)
template <bool AllowUtf8>
static auto is_hostname_impl(const std::string_view value) -> bool {
// RFC 952 §B: <hname> requires at least one <name>
if (value.empty()) {
return false;
Expand All @@ -27,42 +27,57 @@ auto is_hostname(const std::string_view value) -> bool {
}

std::string_view::size_type position{0};

while (position < value.size()) {
const auto label_start{position};

// RFC 1123 §2.1: first character is letter or digit
if (!is_let_dig(value[position])) {
return false;
}
position += 1;
bool last_was_hyphen{false};
bool label_has_content{false};

while (position < value.size() && value[position] != '.') {
// RFC 952 §B: interior characters are let-dig-hyp
if (!is_let_dig_hyp(value[position])) {
return false;
const auto character{value[position]};
if (character == '-') {
// RFC 1123 §2.1: first character must be let-dig, never hyphen
if (!label_has_content) {
return false;
}
last_was_hyphen = true;
position += 1;
label_has_content = true;
continue;
}
position += 1;
}

const auto label_length{position - label_start};
if (is_let_dig(character)) {
last_was_hyphen = false;
position += 1;
label_has_content = true;
continue;
}

// RFC 1123 §2.1: MUST handle host names of up to 63 characters (per label)
if (label_length > 63) {
return false;
if constexpr (AllowUtf8) {
// RFC 5890 §2.3.2.3 / RFC 6532 §3.1: accept a multi-byte UTF-8 encoded
// non-ASCII code point as part of a U-label
const auto utf8_length{utf8_codepoint_length(value, position)};
if (utf8_length < 2) {
return false;
}
last_was_hyphen = false;
position += utf8_length;
label_has_content = true;
} else {
return false;
}
}

// RFC 952 §B + ASSUMPTIONS: last character must not be a minus sign
if (value[position - 1] == '-') {
// RFC 1035 §2.3.4: per-label cap is 63 octets
const auto label_length{position - label_start};
if (label_length == 0 || label_length > 63 || last_was_hyphen) {
return false;
}

// If we stopped on a dot, there must be another label following it
if (position < value.size()) {
// value[position] == '.'
position += 1;
// Trailing dot: JSON Schema test suite requires rejection (TS d7+ #15)
if (position >= value.size()) {
if (position == value.size()) {
// Trailing dot is not part of the host name grammar
return false;
}
}
Expand All @@ -71,4 +86,12 @@ auto is_hostname(const std::string_view value) -> bool {
return true;
}

// ASCII-only entry point: forwards to the shared validator with the
// AllowUtf8 template flag disabled, so any non-ASCII byte is rejected
auto is_hostname(const std::string_view value) -> bool {
return is_hostname_impl<false>(value);
}

// Internationalized entry point: forwards to the shared validator with the
// AllowUtf8 template flag enabled, so labels may also carry multi-byte UTF-8
// sequences in addition to letters, digits, and interior hyphens
auto is_idn_hostname(const std::string_view value) -> bool {
return is_hostname_impl<true>(value);
}

} // namespace sourcemeta::core
32 changes: 28 additions & 4 deletions src/core/dns/include/sourcemeta/core/dns.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ namespace sourcemeta::core {
/// @ingroup dns
/// Check whether the given string is a valid Internet host name per
/// RFC 1123 Section 2.1, which relaxes the first-character rule of
/// RFC 952 to allow either a letter or a digit. This matches the
/// definition used by the JSON Schema `hostname` format. For example:
/// RFC 952 to allow either a letter or a digit. For example:
///
/// ```cpp
/// #include <sourcemeta/core/dns.h>
Expand All @@ -36,11 +35,36 @@ namespace sourcemeta::core {
/// ```
///
/// This function implements RFC 1123 §2.1 (ASCII only). It does not
/// perform A-label or Punycode decoding. Those belong to the separate
/// `idn-hostname` format.
/// perform A-label or Punycode decoding. For internationalized host
/// names see `is_idn_hostname`.
SOURCEMETA_CORE_DNS_EXPORT
auto is_hostname(const std::string_view value) -> bool;

/// @ingroup dns
/// Check whether the given string is a valid internationalized host name.
/// Accepts every input that `is_hostname` accepts, and additionally allows
/// each label to contain valid UTF-8 non-ASCII byte sequences (RFC 6532
/// Section 3.1), modelling the U-label extension of RFC 5890 Section
/// 2.3.2.3. For example:
///
/// ```cpp
/// #include <sourcemeta/core/dns.h>
///
/// #include <cassert>
///
/// assert(sourcemeta::core::is_idn_hostname("www.example.com"));
/// assert(sourcemeta::core::is_idn_hostname(
/// "\xec\x8b\xa4\xeb\xa1\x80.\xed\x85\x8c\xec\x8a\xa4\xed\x8a\xb8"));
/// assert(!sourcemeta::core::is_idn_hostname("-bad"));
/// ```
///
/// This is a best-effort lexical check: it accepts the byte-level structure
/// of an RFC 5890 U-label but does not perform full IDNA2008 validation
/// (no NFC normalization, no Bidi rule, no ContextJ/O checks, no Punycode
/// round-trip).
SOURCEMETA_CORE_DNS_EXPORT
auto is_idn_hostname(const std::string_view value) -> bool;

} // namespace sourcemeta::core

#endif
3 changes: 2 additions & 1 deletion src/core/email/email.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <sourcemeta/core/email.h>

#include <sourcemeta/core/dns.h>
#include <sourcemeta/core/unicode.h>

#include "helpers.h"

Expand Down Expand Up @@ -115,7 +116,7 @@ static auto is_mailbox(const std::string_view value) -> bool {

if constexpr (AllowUtf8) {
// RFC 6531 §3.3: sub-domain =/ U-label
return is_idn_domain(domain);
return is_idn_hostname(domain);
} else {
// RFC 5321 §4.1.2 Domain matches is_hostname (RFC 1123 §2.1) by
// grammar, by 63-octet label cap (RFC 1035 §2.3.4), and by
Expand Down
64 changes: 0 additions & 64 deletions src/core/email/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#define SOURCEMETA_CORE_EMAIL_HELPERS_H_

#include <sourcemeta/core/ip.h>
#include <sourcemeta/core/unicode.h>

#include <string_view> // std::string_view

Expand Down Expand Up @@ -132,69 +131,6 @@ inline auto is_address_literal(const std::string_view domain) -> bool {
return is_general_address_literal(inner);
}

// TODO: Move to src/core/dns

// RFC 6531 §3.3: sub-domain =/ U-label
// Validates a dot-separated domain under the relaxed sub-domain grammar: every
// label is a non-empty run of LetDig / hyphen / UTF8-non-ascii bytes that
// neither begins nor ends with a hyphen. Length limits follow RFC 5321
// §4.5.3.1.2 (255 octets for the whole domain) and RFC 1035 §2.3.4 (63 octets
// per label). Returns true only when the entire input satisfies the grammar.
inline auto is_idn_domain(const std::string_view value) -> bool {
  constexpr std::string_view::size_type maximum_domain_octets{255};
  constexpr std::string_view::size_type maximum_label_octets{63};

  const auto total{value.size()};
  if (total == 0 || total > maximum_domain_octets) {
    return false;
  }

  std::string_view::size_type cursor{0};
  while (cursor < total) {
    const auto label_begin{cursor};
    bool ends_with_hyphen{false};

    // Consume one label, stopping at the next dot or at the end of input
    for (; cursor < total && value[cursor] != '.';) {
      const auto current{value[cursor]};
      if (current == '-') {
        // Nothing consumed yet in this label means the hyphen is leading
        if (cursor == label_begin) {
          return false;
        }
        ends_with_hyphen = true;
        cursor += 1;
      } else if (is_let_dig(current)) {
        ends_with_hyphen = false;
        cursor += 1;
      } else {
        // Anything else must be a multi-byte UTF-8 sequence
        const auto width{
            sourcemeta::core::utf8_codepoint_length(value, cursor)};
        if (width < 2) {
          return false;
        }
        ends_with_hyphen = false;
        cursor += width;
      }
    }

    const auto label_octets{cursor - label_begin};
    const bool label_is_valid{label_octets != 0 &&
                              label_octets <= maximum_label_octets &&
                              !ends_with_hyphen};
    if (!label_is_valid) {
      return false;
    }

    if (cursor < total) {
      // Step over the separating dot
      cursor += 1;
      if (cursor == total) {
        // RFC 5321 §4.1.2 Domain has no trailing dot
        return false;
      }
    }
  }

  return true;
}

} // namespace

#endif
2 changes: 1 addition & 1 deletion test/dns/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME dns
SOURCES hostname_test.cc)
SOURCES hostname_test.cc idn_hostname_test.cc)

target_link_libraries(sourcemeta_core_dns_unit
PRIVATE sourcemeta::core::dns)
Loading
Loading