From 9e07e3e137952720673e2b0613e57fc9d43e692a Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 17 Mar 2026 08:45:36 -0500
Subject: [PATCH 1/2] Add host device marker to digit counting of u32 and u64

---
 include/boost/charconv/detail/integer_search_trees.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/boost/charconv/detail/integer_search_trees.hpp b/include/boost/charconv/detail/integer_search_trees.hpp
index 52e8ec12..ece632b0 100644
--- a/include/boost/charconv/detail/integer_search_trees.hpp
+++ b/include/boost/charconv/detail/integer_search_trees.hpp
@@ -32,7 +32,7 @@ BOOST_CHARCONV_CXX14_CONSTEXPR int num_digits(T x) noexcept
 }
 
 template <>
-BOOST_CHARCONV_CXX14_CONSTEXPR int num_digits(std::uint32_t x) noexcept
+BOOST_CHARCONV_HOST_DEVICE BOOST_CHARCONV_CXX14_CONSTEXPR int num_digits(std::uint32_t x) noexcept
 {
     if (x >= UINT32_C(10000))
     {
@@ -76,7 +76,7 @@ BOOST_CHARCONV_CXX14_CONSTEXPR int num_digits(std::uint32_t x) noexcept
 }
 
 template <>
-BOOST_CHARCONV_CXX14_CONSTEXPR int num_digits(std::uint64_t x) noexcept
+BOOST_CHARCONV_HOST_DEVICE BOOST_CHARCONV_CXX14_CONSTEXPR int num_digits(std::uint64_t x) noexcept
 {
     if (x >= UINT64_C(10000000000))
     {

From 9d98c61565bcd6a18dc695254d506b964f9b9713 Mon Sep 17 00:00:00 2001
From: Matt Borland
Date: Tue, 17 Mar 2026 08:45:48 -0500
Subject: [PATCH 2/2] Add testing of device digit counting

---
 test/cuda_jamfile              |   3 +
 test/test_num_digits_uint32.cu | 129 ++++++++++++++++++++++++++++
 test/test_num_digits_uint64.cu | 149 +++++++++++++++++++++++++++++++++
 3 files changed, 281 insertions(+)
 create mode 100644 test/test_num_digits_uint32.cu
 create mode 100644 test/test_num_digits_uint64.cu

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index 3ea223f1..e0dfe07c 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -56,3 +56,6 @@ run test_to_chars_bases_long.cu ;
 run test_to_chars_bases_unsigned_long.cu ;
 run test_to_chars_bases_long_long.cu ;
 run test_to_chars_bases_unsigned_long_long.cu ;
+
+run test_num_digits_uint32.cu ;
+run test_num_digits_uint64.cu ;
diff --git a/test/test_num_digits_uint32.cu b/test/test_num_digits_uint32.cu
new file mode 100644
index 00000000..831aca09
--- /dev/null
+++ b/test/test_num_digits_uint32.cu
@@ -0,0 +1,129 @@
+// Copyright Matt Borland 2024 - 2026.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/charconv/detail/integer_search_trees.hpp>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <random>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = std::uint32_t;
+
+// One thread per element: stores num_digits(in[i]) in out[i]; threads with i >= numElements do nothing
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::charconv::detail::num_digits(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<test_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<int> output_vector(numElements);
+
+    // Initialize the input vector with random values in [1, max]; 0 is covered by the boundary values below
+    std::uniform_int_distribution<test_type> dist {1, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Overwrite the first elements with boundary values at each digit-count transition
+    // 1-digit: 1-9, 2-digit: 10-99, ..., 10-digit: 1000000000-4294967295
+    test_type boundaries[] = {
+        UINT32_C(0),
+        UINT32_C(1),
+        UINT32_C(9),
+        UINT32_C(10),
+        UINT32_C(99),
+        UINT32_C(100),
+        UINT32_C(999),
+        UINT32_C(1000),
+        UINT32_C(9999),
+        UINT32_C(10000),
+        UINT32_C(99999),
+        UINT32_C(100000),
+        UINT32_C(999999),
+        UINT32_C(1000000),
+        UINT32_C(9999999),
+        UINT32_C(10000000),
+        UINT32_C(99999999),
+        UINT32_C(100000000),
+        UINT32_C(999999999),
+        UINT32_C(1000000000),
+        UINT32_C(4294967295)
+    };
+    int num_boundaries = sizeof(boundaries) / sizeof(boundaries[0]);
+    for (int i = 0; i < num_boundaries && i < numElements; ++i)
+    {
+        input_vector[i] = boundaries[i];
+    }
+
+    // Launch the CUDA kernel with enough blocks to give every element its own thread
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify each device result against the host num_digits for the same input
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+    {
+        int cpu_result = boost::charconv::detail::num_digits(input_vector[i]);
+        if (output_vector[i] != cpu_result)
+        {
+            std::cerr << "Result verification failed at element " << i
+                      << ": input=" << input_vector[i]
+                      << " gpu=" << output_vector[i]
+                      << " cpu=" << cpu_result << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    double t = w.elapsed();
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_num_digits_uint64.cu b/test/test_num_digits_uint64.cu
new file mode 100644
index 00000000..7e33a038
--- /dev/null
+++ b/test/test_num_digits_uint64.cu
@@ -0,0 +1,149 @@
+// Copyright Matt Borland 2024 - 2026.
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0. (See accompanying file
+// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/charconv/detail/integer_search_trees.hpp>
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <random>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = std::uint64_t;
+
+// One thread per element: stores num_digits(in[i]) in out[i]; threads with i >= numElements do nothing
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::charconv::detail::num_digits(in[i]);
+    }
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<test_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<int> output_vector(numElements);
+
+    // Initialize the input vector with random values in [1, max]; 0 is covered by the boundary values below
+    std::uniform_int_distribution<test_type> dist {1, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Overwrite the first elements with boundary values at each digit-count transition
+    // 1-digit: 1-9, 2-digit: 10-99, ..., 20-digit: 10000000000000000000-18446744073709551615
+    test_type boundaries[] = {
+        UINT64_C(0),
+        UINT64_C(1),
+        UINT64_C(9),
+        UINT64_C(10),
+        UINT64_C(99),
+        UINT64_C(100),
+        UINT64_C(999),
+        UINT64_C(1000),
+        UINT64_C(9999),
+        UINT64_C(10000),
+        UINT64_C(99999),
+        UINT64_C(100000),
+        UINT64_C(999999),
+        UINT64_C(1000000),
+        UINT64_C(9999999),
+        UINT64_C(10000000),
+        UINT64_C(99999999),
+        UINT64_C(100000000),
+        UINT64_C(999999999),
+        UINT64_C(1000000000),
+        UINT64_C(9999999999),
+        UINT64_C(10000000000),
+        UINT64_C(99999999999),
+        UINT64_C(100000000000),
+        UINT64_C(999999999999),
+        UINT64_C(1000000000000),
+        UINT64_C(9999999999999),
+        UINT64_C(10000000000000),
+        UINT64_C(99999999999999),
+        UINT64_C(100000000000000),
+        UINT64_C(999999999999999),
+        UINT64_C(1000000000000000),
+        UINT64_C(9999999999999999),
+        UINT64_C(10000000000000000),
+        UINT64_C(99999999999999999),
+        UINT64_C(100000000000000000),
+        UINT64_C(999999999999999999),
+        UINT64_C(1000000000000000000),
+        UINT64_C(9999999999999999999),
+        UINT64_C(10000000000000000000),
+        UINT64_C(18446744073709551615)
+    };
+    int num_boundaries = sizeof(boundaries) / sizeof(boundaries[0]);
+    for (int i = 0; i < num_boundaries && i < numElements; ++i)
+    {
+        input_vector[i] = boundaries[i];
+    }
+
+    // Launch the CUDA kernel with enough blocks to give every element its own thread
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify each device result against the host num_digits for the same input
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+    {
+        int cpu_result = boost::charconv::detail::num_digits(input_vector[i]);
+        if (output_vector[i] != cpu_result)
+        {
+            std::cerr << "Result verification failed at element " << i
+                      << ": input=" << input_vector[i]
+                      << " gpu=" << output_vector[i]
+                      << " cpu=" << cpu_result << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    double t = w.elapsed();
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}