From e29d727b9291704f01405ce1699f355f110fea90 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 09:35:22 -0400 Subject: [PATCH 01/36] Add CUDA support to bit functions --- include/boost/safe_numbers/bit.hpp | 39 +++++++++++++++++++----------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/include/boost/safe_numbers/bit.hpp b/include/boost/safe_numbers/bit.hpp index 77b7843..38c3ce7 100644 --- a/include/boost/safe_numbers/bit.hpp +++ b/include/boost/safe_numbers/bit.hpp @@ -13,13 +13,14 @@ #include #include +#include #endif // BOOST_SAFE_NUMBERS_BUILD_MODULE namespace boost::safe_numbers { BOOST_SAFE_NUMBERS_EXPORT template -[[nodiscard]] constexpr auto has_single_bit(const UnsignedInt x) noexcept -> bool +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto has_single_bit(const UnsignedInt x) noexcept -> bool { using boost::core::has_single_bit; using underlying_type = detail::underlying_type_t; @@ -28,7 +29,7 @@ BOOST_SAFE_NUMBERS_EXPORT template } BOOST_SAFE_NUMBERS_EXPORT template -[[nodiscard]] constexpr auto bit_ceil(const UnsignedInt x) noexcept -> UnsignedInt +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_ceil(const UnsignedInt x) noexcept -> UnsignedInt { using boost::core::bit_ceil; using underlying_type = detail::underlying_type_t; @@ -37,7 +38,7 @@ BOOST_SAFE_NUMBERS_EXPORT template } BOOST_SAFE_NUMBERS_EXPORT template -[[nodiscard]] constexpr auto bit_floor(const UnsignedInt x) noexcept -> UnsignedInt +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_floor(const UnsignedInt x) noexcept -> UnsignedInt { using boost::core::bit_floor; using underlying_type = detail::underlying_type_t; @@ -46,7 +47,7 @@ BOOST_SAFE_NUMBERS_EXPORT template } BOOST_SAFE_NUMBERS_EXPORT template -[[nodiscard]] constexpr auto bit_width(const UnsignedInt x) noexcept -> int +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_width(const UnsignedInt x) noexcept -> int { using boost::core::bit_width; using underlying_type = detail::underlying_type_t; @@ -55,7 +56,7 @@ BOOST_SAFE_NUMBERS_EXPORT template } BOOST_SAFE_NUMBERS_EXPORT template -[[nodiscard]] constexpr auto rotl(const UnsignedInt x, const int s) noexcept -> UnsignedInt +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto rotl(const UnsignedInt x, const int s) noexcept -> UnsignedInt { using boost::core::rotl; using underlying_type = detail::underlying_type_t; @@ -64,7 +65,7 @@ BOOST_SAFE_NUMBERS_EXPORT template -[[nodiscard]] constexpr auto rotr(const UnsignedInt x, const int s) noexcept -> UnsignedInt +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto rotr(const UnsignedInt x, const int s) noexcept -> UnsignedInt { using boost::core::rotr; using underlying_type = detail::underlying_type_t; @@ -73,7 +74,7 @@ BOOST_SAFE_NUMBERS_EXPORT template -[[nodiscard]] constexpr auto countl_zero(const UnsignedInt x) noexcept -> int +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countl_zero(const UnsignedInt x) noexcept -> int { using boost::core::countl_zero; using underlying_type = detail::underlying_type_t; @@ -82,7 +83,7 @@ BOOST_SAFE_NUMBERS_EXPORT template } BOOST_SAFE_NUMBERS_EXPORT template -[[nodiscard]] constexpr auto countl_one(const UnsignedInt x) noexcept -> int +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countl_one(const UnsignedInt x) noexcept -> int { using boost::core::countl_one; using underlying_type = detail::underlying_type_t; @@ -91,7 +92,7 @@ BOOST_SAFE_NUMBERS_EXPORT template } BOOST_SAFE_NUMBERS_EXPORT template -[[nodiscard]] constexpr auto countr_zero(const UnsignedInt x) noexcept -> int +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countr_zero(const UnsignedInt x) noexcept -> int { using boost::core::countr_zero; using underlying_type = detail::underlying_type_t; @@ -100,7 +101,7 @@ BOOST_SAFE_NUMBERS_EXPORT template } BOOST_SAFE_NUMBERS_EXPORT template -[[nodiscard]] constexpr auto countr_one(const UnsignedInt x) noexcept -> int +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countr_one(const UnsignedInt x) noexcept -> int { using boost::core::countr_one; using underlying_type = detail::underlying_type_t; @@ -109,7 +110,7 @@ BOOST_SAFE_NUMBERS_EXPORT template } BOOST_SAFE_NUMBERS_EXPORT template -[[nodiscard]] constexpr auto popcount(const UnsignedInt x) noexcept -> int +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto popcount(const UnsignedInt x) noexcept -> int { using boost::core::popcount; using underlying_type = detail::underlying_type_t; @@ -118,7 +119,7 @@ BOOST_SAFE_NUMBERS_EXPORT template } BOOST_SAFE_NUMBERS_EXPORT template -[[nodiscard]] constexpr auto byteswap(const Int x) noexcept -> Int +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto byteswap(const Int x) noexcept -> Int { using boost::core::byteswap; using underlying_type = detail::underlying_type_t; @@ -149,11 +150,21 @@ consteval auto make_byte_reverse_table() -> std::array return table; } +#if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + inline constexpr auto reverse_table {make_byte_reverse_table()}; +#endif + template -[[nodiscard]] constexpr auto bitswap_impl(UnsignedInt x) noexcept -> UnsignedInt +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bitswap_impl(UnsignedInt x) noexcept -> UnsignedInt { + #if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + constexpr auto reverse_table {make_byte_reverse_table()}; + + #endif + if constexpr (sizeof(UnsignedInt) == 1) { return static_cast(reverse_table[static_cast(x)]); @@ -177,7 +188,7 @@ template } // namespace detail BOOST_SAFE_NUMBERS_EXPORT template -[[nodiscard]] constexpr auto bitswap(Int x) noexcept -> Int +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bitswap(Int x) noexcept -> Int { using underlying_type = detail::underlying_type_t; return static_cast(detail::bitswap_impl(static_cast(x))); From b725ea15a64840f94da924dd0f63254d5de0070b Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 09:35:40 -0400 Subject: [PATCH 02/36] Add CUDA testing of bit functions --- test/cuda_jamfile | 77 ++++++++++++++++++++++++ test/test_cuda_u128_bit_ceil.cu | 87 +++++++++++++++++++++++++++ test/test_cuda_u128_bit_floor.cu | 87 +++++++++++++++++++++++++++ test/test_cuda_u128_bit_width.cu | 87 +++++++++++++++++++++++++++ test/test_cuda_u128_bitswap.cu | 87 +++++++++++++++++++++++++++ test/test_cuda_u128_byteswap.cu | 87 +++++++++++++++++++++++++++ test/test_cuda_u128_countl_one.cu | 87 +++++++++++++++++++++++++++ test/test_cuda_u128_countl_zero.cu | 87 +++++++++++++++++++++++++++ test/test_cuda_u128_countr_one.cu | 87 +++++++++++++++++++++++++++ test/test_cuda_u128_countr_zero.cu | 87 +++++++++++++++++++++++++++ test/test_cuda_u128_has_single_bit.cu | 87 +++++++++++++++++++++++++++ test/test_cuda_u128_popcount.cu | 87 +++++++++++++++++++++++++++ test/test_cuda_u128_rotl.cu | 87 +++++++++++++++++++++++++++ test/test_cuda_u128_rotr.cu | 87 +++++++++++++++++++++++++++ test/test_cuda_u16_bit_ceil.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u16_bit_floor.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u16_bit_width.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u16_bitswap.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u16_byteswap.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u16_countl_one.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u16_countl_zero.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u16_countr_one.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u16_countr_zero.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u16_has_single_bit.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u16_popcount.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u16_rotl.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u16_rotr.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u32_bit_ceil.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u32_bit_floor.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u32_bit_width.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u32_bitswap.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u32_byteswap.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u32_countl_one.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u32_countl_zero.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u32_countr_one.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u32_countr_zero.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u32_has_single_bit.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u32_popcount.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u32_rotl.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u32_rotr.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u64_bit_ceil.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u64_bit_floor.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u64_bit_width.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u64_bitswap.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u64_byteswap.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u64_countl_one.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u64_countl_zero.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u64_countr_one.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u64_countr_zero.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u64_has_single_bit.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u64_popcount.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u64_rotl.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u64_rotr.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u8_bit_ceil.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u8_bit_floor.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u8_bit_width.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u8_bitswap.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u8_byteswap.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u8_countl_one.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u8_countl_zero.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u8_countr_one.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u8_countr_zero.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u8_has_single_bit.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u8_popcount.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u8_rotl.cu | 82 +++++++++++++++++++++++++ test/test_cuda_u8_rotr.cu | 82 +++++++++++++++++++++++++ 66 files changed, 5472 insertions(+) create mode 100644 test/test_cuda_u128_bit_ceil.cu create mode 100644 test/test_cuda_u128_bit_floor.cu create mode 100644 test/test_cuda_u128_bit_width.cu create mode 100644 test/test_cuda_u128_bitswap.cu create mode 100644 test/test_cuda_u128_byteswap.cu create mode 100644 test/test_cuda_u128_countl_one.cu create mode 100644 test/test_cuda_u128_countl_zero.cu create mode 100644 test/test_cuda_u128_countr_one.cu create mode 100644 test/test_cuda_u128_countr_zero.cu create mode 100644 test/test_cuda_u128_has_single_bit.cu create mode 100644 test/test_cuda_u128_popcount.cu create mode 100644 test/test_cuda_u128_rotl.cu create mode 100644 test/test_cuda_u128_rotr.cu create mode 100644 test/test_cuda_u16_bit_ceil.cu create mode 100644 test/test_cuda_u16_bit_floor.cu create mode 100644 test/test_cuda_u16_bit_width.cu create mode 100644 test/test_cuda_u16_bitswap.cu create mode 100644 test/test_cuda_u16_byteswap.cu create mode 100644 test/test_cuda_u16_countl_one.cu create mode 100644 test/test_cuda_u16_countl_zero.cu create mode 100644 test/test_cuda_u16_countr_one.cu create mode 100644 test/test_cuda_u16_countr_zero.cu create mode 100644 test/test_cuda_u16_has_single_bit.cu create mode 100644 test/test_cuda_u16_popcount.cu create mode 100644 test/test_cuda_u16_rotl.cu create mode 100644 test/test_cuda_u16_rotr.cu create mode 100644 test/test_cuda_u32_bit_ceil.cu create mode 100644 test/test_cuda_u32_bit_floor.cu create mode 100644 test/test_cuda_u32_bit_width.cu create mode 100644 test/test_cuda_u32_bitswap.cu create mode 100644 test/test_cuda_u32_byteswap.cu create mode 100644 test/test_cuda_u32_countl_one.cu create mode 100644 test/test_cuda_u32_countl_zero.cu create mode 100644 test/test_cuda_u32_countr_one.cu create mode 100644 test/test_cuda_u32_countr_zero.cu create mode 100644 test/test_cuda_u32_has_single_bit.cu create mode 100644 test/test_cuda_u32_popcount.cu create mode 100644 test/test_cuda_u32_rotl.cu create mode 100644 test/test_cuda_u32_rotr.cu create mode 100644 test/test_cuda_u64_bit_ceil.cu create mode 100644 test/test_cuda_u64_bit_floor.cu create mode 100644 test/test_cuda_u64_bit_width.cu create mode 100644 test/test_cuda_u64_bitswap.cu create mode 100644 test/test_cuda_u64_byteswap.cu create mode 100644 test/test_cuda_u64_countl_one.cu create mode 100644 test/test_cuda_u64_countl_zero.cu create mode 100644 test/test_cuda_u64_countr_one.cu create mode 100644 test/test_cuda_u64_countr_zero.cu create mode 100644 test/test_cuda_u64_has_single_bit.cu create mode 100644 test/test_cuda_u64_popcount.cu create mode 100644 test/test_cuda_u64_rotl.cu create mode 100644 test/test_cuda_u64_rotr.cu create mode 100644 test/test_cuda_u8_bit_ceil.cu create mode 100644 test/test_cuda_u8_bit_floor.cu create mode 100644 test/test_cuda_u8_bit_width.cu create mode 100644 test/test_cuda_u8_bitswap.cu create mode 100644 test/test_cuda_u8_byteswap.cu create mode 100644 test/test_cuda_u8_countl_one.cu create mode 100644 test/test_cuda_u8_countl_zero.cu create mode 100644 test/test_cuda_u8_countr_one.cu create mode 100644 test/test_cuda_u8_countr_zero.cu create mode 100644 test/test_cuda_u8_has_single_bit.cu create mode 100644 test/test_cuda_u8_popcount.cu create mode 100644 test/test_cuda_u8_rotl.cu create mode 100644 test/test_cuda_u8_rotr.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index d00564a..86f9093 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -80,3 +80,80 @@ run test_cuda_u128_sub_error.cu ; run test_cuda_u128_mul_error.cu ; run test_cuda_u128_div_error.cu ; run test_cuda_u128_mod_error.cu ; + +# Bit function tests + +# u8 bit tests +run test_cuda_u8_has_single_bit.cu ; +run test_cuda_u8_bit_ceil.cu ; +run test_cuda_u8_bit_floor.cu ; +run test_cuda_u8_bit_width.cu ; +run test_cuda_u8_rotl.cu ; +run test_cuda_u8_rotr.cu ; +run test_cuda_u8_countl_zero.cu ; +run test_cuda_u8_countl_one.cu ; +run test_cuda_u8_countr_zero.cu ; +run test_cuda_u8_countr_one.cu ; +run test_cuda_u8_popcount.cu ; +run test_cuda_u8_byteswap.cu ; +run test_cuda_u8_bitswap.cu ; + +# u16 bit tests +run test_cuda_u16_has_single_bit.cu ; +run test_cuda_u16_bit_ceil.cu ; +run test_cuda_u16_bit_floor.cu ; +run test_cuda_u16_bit_width.cu ; +run test_cuda_u16_rotl.cu ; +run test_cuda_u16_rotr.cu ; +run test_cuda_u16_countl_zero.cu ; +run test_cuda_u16_countl_one.cu ; +run test_cuda_u16_countr_zero.cu ; +run test_cuda_u16_countr_one.cu ; +run test_cuda_u16_popcount.cu ; +run test_cuda_u16_byteswap.cu ; +run test_cuda_u16_bitswap.cu ; + +# u32 bit tests +run test_cuda_u32_has_single_bit.cu ; +run test_cuda_u32_bit_ceil.cu ; +run test_cuda_u32_bit_floor.cu ; +run test_cuda_u32_bit_width.cu ; +run test_cuda_u32_rotl.cu ; +run test_cuda_u32_rotr.cu ; +run test_cuda_u32_countl_zero.cu ; +run test_cuda_u32_countl_one.cu ; +run test_cuda_u32_countr_zero.cu ; +run test_cuda_u32_countr_one.cu ; +run test_cuda_u32_popcount.cu ; +run test_cuda_u32_byteswap.cu ; +run test_cuda_u32_bitswap.cu ; + +# u64 bit tests +run test_cuda_u64_has_single_bit.cu ; +run test_cuda_u64_bit_ceil.cu ; +run test_cuda_u64_bit_floor.cu ; +run test_cuda_u64_bit_width.cu ; +run test_cuda_u64_rotl.cu ; +run test_cuda_u64_rotr.cu ; +run test_cuda_u64_countl_zero.cu ; +run test_cuda_u64_countl_one.cu ; +run test_cuda_u64_countr_zero.cu ; +run test_cuda_u64_countr_one.cu ; +run test_cuda_u64_popcount.cu ; +run test_cuda_u64_byteswap.cu ; +run test_cuda_u64_bitswap.cu ; + +# u128 bit tests +run test_cuda_u128_has_single_bit.cu ; +run test_cuda_u128_bit_ceil.cu ; +run test_cuda_u128_bit_floor.cu ; +run test_cuda_u128_bit_width.cu ; +run test_cuda_u128_rotl.cu ; +run test_cuda_u128_rotr.cu ; +run test_cuda_u128_countl_zero.cu ; +run test_cuda_u128_countl_one.cu ; +run test_cuda_u128_countr_zero.cu ; +run test_cuda_u128_countr_one.cu ; +run test_cuda_u128_popcount.cu ; +run test_cuda_u128_byteswap.cu ; +run test_cuda_u128_bitswap.cu ; diff --git a/test/test_cuda_u128_bit_ceil.cu b/test/test_cuda_u128_bit_ceil.cu new file mode 100644 index 0000000..52ca8ec --- /dev/null +++ b/test/test_cuda_u128_bit_ceil.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_ceil(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)() / basis_type{2U}}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_ceil(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_bit_floor.cu b/test/test_cuda_u128_bit_floor.cu new file mode 100644 index 0000000..b22b8ba --- /dev/null +++ b/test/test_cuda_u128_bit_floor.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_floor(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_floor(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_bit_width.cu b/test/test_cuda_u128_bit_width.cu new file mode 100644 index 0000000..99ede69 --- /dev/null +++ b/test/test_cuda_u128_bit_width.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_width(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_width(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_bitswap.cu b/test/test_cuda_u128_bitswap.cu new file mode 100644 index 0000000..636d746 --- /dev/null +++ b/test/test_cuda_u128_bitswap.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bitswap(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bitswap(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_byteswap.cu b/test/test_cuda_u128_byteswap.cu new file mode 100644 index 0000000..8c449f8 --- /dev/null +++ b/test/test_cuda_u128_byteswap.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::byteswap(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::byteswap(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_countl_one.cu b/test/test_cuda_u128_countl_one.cu new file mode 100644 index 0000000..06caaef --- /dev/null +++ b/test/test_cuda_u128_countl_one.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countl_one(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countl_one(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_countl_zero.cu b/test/test_cuda_u128_countl_zero.cu new file mode 100644 index 0000000..74cbfe6 --- /dev/null +++ b/test/test_cuda_u128_countl_zero.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countl_zero(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countl_zero(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_countr_one.cu b/test/test_cuda_u128_countr_one.cu new file mode 100644 index 0000000..8fa6fa7 --- /dev/null +++ b/test/test_cuda_u128_countr_one.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countr_one(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countr_one(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_countr_zero.cu b/test/test_cuda_u128_countr_zero.cu new file mode 100644 index 0000000..1fd6114 --- /dev/null +++ b/test/test_cuda_u128_countr_zero.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countr_zero(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countr_zero(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_has_single_bit.cu b/test/test_cuda_u128_has_single_bit.cu new file mode 100644 index 0000000..775398f --- /dev/null +++ b/test/test_cuda_u128_has_single_bit.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::has_single_bit(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::has_single_bit(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_popcount.cu b/test/test_cuda_u128_popcount.cu new file mode 100644 index 0000000..ad62bd6 --- /dev/null +++ b/test/test_cuda_u128_popcount.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::popcount(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::popcount(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_rotl.cu b/test/test_cuda_u128_rotl.cu new file mode 100644 index 0000000..3b33dd7 --- /dev/null +++ b/test/test_cuda_u128_rotl.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::rotl(in[i], 3); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::rotl(input_vector[i], 3)); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_rotr.cu b/test/test_cuda_u128_rotr.cu new file mode 100644 index 0000000..6e259de --- /dev/null +++ b/test/test_cuda_u128_rotr.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::rotr(in[i], 3); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::rotr(input_vector[i], 3)); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_bit_ceil.cu b/test/test_cuda_u16_bit_ceil.cu new file mode 100644 index 0000000..a8ddfb7 --- /dev/null +++ b/test/test_cuda_u16_bit_ceil.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_ceil(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)()) / 2U}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_ceil(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_bit_floor.cu b/test/test_cuda_u16_bit_floor.cu new file mode 100644 index 0000000..ef63dbe --- /dev/null +++ b/test/test_cuda_u16_bit_floor.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_floor(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_floor(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_bit_width.cu b/test/test_cuda_u16_bit_width.cu new file mode 100644 index 0000000..2085fc5 --- /dev/null +++ b/test/test_cuda_u16_bit_width.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_width(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_width(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_bitswap.cu b/test/test_cuda_u16_bitswap.cu new file mode 100644 index 0000000..db46116 --- /dev/null +++ b/test/test_cuda_u16_bitswap.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bitswap(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bitswap(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_byteswap.cu b/test/test_cuda_u16_byteswap.cu new file mode 100644 index 0000000..a9fcb6d --- /dev/null +++ b/test/test_cuda_u16_byteswap.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::byteswap(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::byteswap(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_countl_one.cu b/test/test_cuda_u16_countl_one.cu new file mode 100644 index 0000000..1fcc61c --- /dev/null +++ b/test/test_cuda_u16_countl_one.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countl_one(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countl_one(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_countl_zero.cu b/test/test_cuda_u16_countl_zero.cu new file mode 100644 index 0000000..e78dc50 --- /dev/null +++ b/test/test_cuda_u16_countl_zero.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countl_zero(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countl_zero(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_countr_one.cu b/test/test_cuda_u16_countr_one.cu new file mode 100644 index 0000000..f900927 --- /dev/null +++ b/test/test_cuda_u16_countr_one.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countr_one(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countr_one(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_countr_zero.cu b/test/test_cuda_u16_countr_zero.cu new file mode 100644 index 0000000..4feddc9 --- /dev/null +++ b/test/test_cuda_u16_countr_zero.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countr_zero(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countr_zero(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_has_single_bit.cu b/test/test_cuda_u16_has_single_bit.cu new file mode 100644 index 0000000..39bb369 --- /dev/null +++ b/test/test_cuda_u16_has_single_bit.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::has_single_bit(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::has_single_bit(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_popcount.cu b/test/test_cuda_u16_popcount.cu new file mode 100644 index 0000000..b883bf3 --- /dev/null +++ b/test/test_cuda_u16_popcount.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::popcount(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::popcount(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_rotl.cu b/test/test_cuda_u16_rotl.cu new file mode 100644 index 0000000..94f331a --- /dev/null +++ b/test/test_cuda_u16_rotl.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::rotl(in[i], 3); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::rotl(input_vector[i], 3)); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_rotr.cu b/test/test_cuda_u16_rotr.cu new file mode 100644 index 0000000..eeda3d0 --- /dev/null +++ b/test/test_cuda_u16_rotr.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::rotr(in[i], 3); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::rotr(input_vector[i], 3)); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_bit_ceil.cu b/test/test_cuda_u32_bit_ceil.cu new file mode 100644 index 0000000..f1b130a --- /dev/null +++ b/test/test_cuda_u32_bit_ceil.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_ceil(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)() / basis_type{2}}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_ceil(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_bit_floor.cu b/test/test_cuda_u32_bit_floor.cu new file mode 100644 index 0000000..987c4ba --- /dev/null +++ b/test/test_cuda_u32_bit_floor.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_floor(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_floor(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_bit_width.cu b/test/test_cuda_u32_bit_width.cu new file mode 100644 index 0000000..4ea5784 --- /dev/null +++ b/test/test_cuda_u32_bit_width.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_width(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_width(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_bitswap.cu b/test/test_cuda_u32_bitswap.cu new file mode 100644 index 0000000..2c8c5b2 --- /dev/null +++ b/test/test_cuda_u32_bitswap.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bitswap(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bitswap(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_byteswap.cu b/test/test_cuda_u32_byteswap.cu new file mode 100644 index 0000000..300e1d9 --- /dev/null +++ b/test/test_cuda_u32_byteswap.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::byteswap(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::byteswap(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_countl_one.cu b/test/test_cuda_u32_countl_one.cu new file mode 100644 index 0000000..b5d40e5 --- /dev/null +++ b/test/test_cuda_u32_countl_one.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countl_one(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countl_one(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_countl_zero.cu b/test/test_cuda_u32_countl_zero.cu new file mode 100644 index 0000000..f5b4284 --- /dev/null +++ b/test/test_cuda_u32_countl_zero.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countl_zero(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countl_zero(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_countr_one.cu b/test/test_cuda_u32_countr_one.cu new file mode 100644 index 0000000..3e687cb --- /dev/null +++ b/test/test_cuda_u32_countr_one.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countr_one(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countr_one(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_countr_zero.cu b/test/test_cuda_u32_countr_zero.cu new file mode 100644 index 0000000..99028ef --- /dev/null +++ b/test/test_cuda_u32_countr_zero.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countr_zero(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countr_zero(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_has_single_bit.cu b/test/test_cuda_u32_has_single_bit.cu new file mode 100644 index 0000000..308d0a6 --- /dev/null +++ b/test/test_cuda_u32_has_single_bit.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::has_single_bit(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::has_single_bit(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_popcount.cu b/test/test_cuda_u32_popcount.cu new file mode 100644 index 0000000..1b2678c --- /dev/null +++ b/test/test_cuda_u32_popcount.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::popcount(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::popcount(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_rotl.cu b/test/test_cuda_u32_rotl.cu new file mode 100644 index 0000000..ed06774 --- /dev/null +++ b/test/test_cuda_u32_rotl.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::rotl(in[i], 3); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::rotl(input_vector[i], 3)); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_rotr.cu b/test/test_cuda_u32_rotr.cu new file mode 100644 index 0000000..9b6b7a2 --- /dev/null +++ b/test/test_cuda_u32_rotr.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::rotr(in[i], 3); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::rotr(input_vector[i], 3)); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_bit_ceil.cu b/test/test_cuda_u64_bit_ceil.cu new file mode 100644 index 0000000..885f44c --- /dev/null +++ b/test/test_cuda_u64_bit_ceil.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_ceil(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)() / basis_type{2}}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_ceil(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_bit_floor.cu b/test/test_cuda_u64_bit_floor.cu new file mode 100644 index 0000000..18b61b0 --- /dev/null +++ b/test/test_cuda_u64_bit_floor.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_floor(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_floor(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_bit_width.cu b/test/test_cuda_u64_bit_width.cu new file mode 100644 index 0000000..ec04975 --- /dev/null +++ b/test/test_cuda_u64_bit_width.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_width(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_width(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_bitswap.cu b/test/test_cuda_u64_bitswap.cu new file mode 100644 index 0000000..ca98035 --- /dev/null +++ b/test/test_cuda_u64_bitswap.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bitswap(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bitswap(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_byteswap.cu b/test/test_cuda_u64_byteswap.cu new file mode 100644 index 0000000..e07beb1 --- /dev/null +++ b/test/test_cuda_u64_byteswap.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::byteswap(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::byteswap(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_countl_one.cu b/test/test_cuda_u64_countl_one.cu new file mode 100644 index 0000000..4f52634 --- /dev/null +++ b/test/test_cuda_u64_countl_one.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countl_one(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countl_one(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_countl_zero.cu b/test/test_cuda_u64_countl_zero.cu new file mode 100644 index 0000000..81d3d67 --- /dev/null +++ b/test/test_cuda_u64_countl_zero.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countl_zero(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countl_zero(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_countr_one.cu b/test/test_cuda_u64_countr_one.cu new file mode 100644 index 0000000..de86742 --- /dev/null +++ b/test/test_cuda_u64_countr_one.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countr_one(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countr_one(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_countr_zero.cu b/test/test_cuda_u64_countr_zero.cu new file mode 100644 index 0000000..c348275 --- /dev/null +++ b/test/test_cuda_u64_countr_zero.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countr_zero(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countr_zero(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_has_single_bit.cu b/test/test_cuda_u64_has_single_bit.cu new file mode 100644 index 0000000..76012cd --- /dev/null +++ b/test/test_cuda_u64_has_single_bit.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::has_single_bit(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::has_single_bit(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_popcount.cu b/test/test_cuda_u64_popcount.cu new file mode 100644 index 0000000..e48df1d --- /dev/null +++ b/test/test_cuda_u64_popcount.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::popcount(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::popcount(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_rotl.cu b/test/test_cuda_u64_rotl.cu new file mode 100644 index 0000000..a3240b1 --- /dev/null +++ b/test/test_cuda_u64_rotl.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::rotl(in[i], 3); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::rotl(input_vector[i], 3)); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_rotr.cu b/test/test_cuda_u64_rotr.cu new file mode 100644 index 0000000..3cf95de --- /dev/null +++ b/test/test_cuda_u64_rotr.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::rotr(in[i], 3); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::rotr(input_vector[i], 3)); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_bit_ceil.cu b/test/test_cuda_u8_bit_ceil.cu new file mode 100644 index 0000000..f84d37d --- /dev/null +++ b/test/test_cuda_u8_bit_ceil.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_ceil(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)()) / 2U}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_ceil(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_bit_floor.cu b/test/test_cuda_u8_bit_floor.cu new file mode 100644 index 0000000..5ef0598 --- /dev/null +++ b/test/test_cuda_u8_bit_floor.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_floor(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_floor(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_bit_width.cu b/test/test_cuda_u8_bit_width.cu new file mode 100644 index 0000000..3ae3bbc --- /dev/null +++ b/test/test_cuda_u8_bit_width.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bit_width(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bit_width(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_bitswap.cu b/test/test_cuda_u8_bitswap.cu new file mode 100644 index 0000000..d2d0d56 --- /dev/null +++ b/test/test_cuda_u8_bitswap.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::bitswap(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::bitswap(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_byteswap.cu b/test/test_cuda_u8_byteswap.cu new file mode 100644 index 0000000..9a6e4d5 --- /dev/null +++ b/test/test_cuda_u8_byteswap.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::byteswap(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::byteswap(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_countl_one.cu b/test/test_cuda_u8_countl_one.cu new file mode 100644 index 0000000..b5b89fe --- /dev/null +++ b/test/test_cuda_u8_countl_one.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countl_one(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countl_one(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_countl_zero.cu b/test/test_cuda_u8_countl_zero.cu new file mode 100644 index 0000000..68ba382 --- /dev/null +++ b/test/test_cuda_u8_countl_zero.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countl_zero(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countl_zero(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_countr_one.cu b/test/test_cuda_u8_countr_one.cu new file mode 100644 index 0000000..4466c8c --- /dev/null +++ b/test/test_cuda_u8_countr_one.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countr_one(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countr_one(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_countr_zero.cu b/test/test_cuda_u8_countr_zero.cu new file mode 100644 index 0000000..9902dd0 --- /dev/null +++ b/test/test_cuda_u8_countr_zero.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::countr_zero(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::countr_zero(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_has_single_bit.cu b/test/test_cuda_u8_has_single_bit.cu new file mode 100644 index 0000000..4c30350 --- /dev/null +++ b/test/test_cuda_u8_has_single_bit.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::has_single_bit(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::has_single_bit(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_popcount.cu b/test/test_cuda_u8_popcount.cu new file mode 100644 index 0000000..cd27d91 --- /dev/null +++ b/test/test_cuda_u8_popcount.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::popcount(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::popcount(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_rotl.cu b/test/test_cuda_u8_rotl.cu new file mode 100644 index 0000000..a45e622 --- /dev/null +++ b/test/test_cuda_u8_rotl.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::rotl(in[i], 3); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::rotl(input_vector[i], 3)); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_rotr.cu b/test/test_cuda_u8_rotr.cu new file mode 100644 index 0000000..47cfd9e --- /dev/null +++ b/test/test_cuda_u8_rotr.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::rotr(in[i], 3); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::rotr(input_vector[i], 3)); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From b394b4707995c0aea63cb96fe30e1c260dd43ef4 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 10:09:22 -0400 Subject: [PATCH 03/36] Use CUDA provided functions when available --- include/boost/safe_numbers/bit.hpp | 131 ++++++++++++++++++++++++++--- 1 file changed, 118 insertions(+), 13 deletions(-) diff --git a/include/boost/safe_numbers/bit.hpp b/include/boost/safe_numbers/bit.hpp index 38c3ce7..9c212f1 100644 --- a/include/boost/safe_numbers/bit.hpp +++ b/include/boost/safe_numbers/bit.hpp @@ -11,10 +11,19 @@ #ifndef BOOST_SAFE_NUMBERS_BUILD_MODULE -#include #include + +#if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + +#include + +#else + +#include #include +#endif + #endif // BOOST_SAFE_NUMBERS_BUILD_MODULE namespace boost::safe_numbers { @@ -22,109 +31,205 @@ namespace boost::safe_numbers { BOOST_SAFE_NUMBERS_EXPORT template BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto has_single_bit(const UnsignedInt x) noexcept -> bool { - using boost::core::has_single_bit; using underlying_type = detail::underlying_type_t; + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + using boost::core::has_single_bit; return has_single_bit(static_cast(x)); + + #else + + return cuda::std::has_single_bit(static_cast(x)); + + #endif } BOOST_SAFE_NUMBERS_EXPORT template BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_ceil(const UnsignedInt x) noexcept -> UnsignedInt { - using boost::core::bit_ceil; using underlying_type = detail::underlying_type_t; + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + using boost::core::bit_ceil; return UnsignedInt{bit_ceil(static_cast(x))}; + + #else + + return UnsignedInt{cuda::std::bit_ceil(static_cast(x))}; + + #endif } BOOST_SAFE_NUMBERS_EXPORT template BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_floor(const UnsignedInt x) noexcept -> UnsignedInt { - using boost::core::bit_floor; using underlying_type = detail::underlying_type_t; + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + using boost::core::bit_floor; return UnsignedInt{bit_floor(static_cast(x))}; + + #else + + return UnsignedInt{cuda::std::bit_floor(static_cast(x))}; + + #endif } BOOST_SAFE_NUMBERS_EXPORT template BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_width(const UnsignedInt x) noexcept -> int { - using boost::core::bit_width; using underlying_type = detail::underlying_type_t; + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + using boost::core::bit_width; return static_cast(bit_width(static_cast(x))); + + #else + + return static_cast(cuda::std::bit_width(static_cast(x))); + + #endif } BOOST_SAFE_NUMBERS_EXPORT template BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto rotl(const UnsignedInt x, const int s) noexcept -> UnsignedInt { - using boost::core::rotl; using underlying_type = detail::underlying_type_t; + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + using boost::core::rotl; return UnsignedInt{rotl(static_cast(x), s)}; + + #else + + return UnsignedInt{cuda::std::rotl(static_cast(x), s)}; + + #endif } BOOST_SAFE_NUMBERS_EXPORT template BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto rotr(const UnsignedInt x, const int s) noexcept -> UnsignedInt { - using boost::core::rotr; using underlying_type = detail::underlying_type_t; + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + using boost::core::rotr; return UnsignedInt{rotr(static_cast(x), s)}; + + #else + + return UnsignedInt{cuda::std::rotr(static_cast(x), s)}; + + #endif } BOOST_SAFE_NUMBERS_EXPORT template BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countl_zero(const UnsignedInt x) noexcept -> int { - using boost::core::countl_zero; using underlying_type = detail::underlying_type_t; + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + using boost::core::countl_zero; return countl_zero(static_cast(x)); + + #else + + return cuda::std::countl_zero(static_cast(x)); + + #endif } BOOST_SAFE_NUMBERS_EXPORT template BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countl_one(const UnsignedInt x) noexcept -> int { - using boost::core::countl_one; using underlying_type = detail::underlying_type_t; + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + using boost::core::countl_one; return countl_one(static_cast(x)); + + #else + + return cuda::std::countl_one(static_cast(x)); + + #endif } BOOST_SAFE_NUMBERS_EXPORT template BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countr_zero(const UnsignedInt x) noexcept -> int { - using boost::core::countr_zero; using underlying_type = detail::underlying_type_t; + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + using boost::core::countr_zero; return countr_zero(static_cast(x)); + + #else + + return cuda::std::countr_zero(static_cast(x)); + + #endif } BOOST_SAFE_NUMBERS_EXPORT template BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countr_one(const UnsignedInt x) noexcept -> int { - using boost::core::countr_one; using underlying_type = detail::underlying_type_t; + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + using boost::core::countr_one; return countr_one(static_cast(x)); + + #else + + return cuda::std::countr_one(static_cast(x)); + + #endif } BOOST_SAFE_NUMBERS_EXPORT template BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto popcount(const UnsignedInt x) noexcept -> int { - using boost::core::popcount; using underlying_type = detail::underlying_type_t; + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + using boost::core::popcount; return popcount(static_cast(x)); + + #else + + return cuda::std::popcount(static_cast(x)); + + #endif } BOOST_SAFE_NUMBERS_EXPORT template BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto byteswap(const Int x) noexcept -> Int { - using boost::core::byteswap; using underlying_type = detail::underlying_type_t; + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + using boost::core::byteswap; return Int{byteswap(static_cast(x))}; + + #else + + return Int{cuda::std::byteswap(static_cast(x))}; + + #endif } namespace detail { From 613cb0fdb6a37c68bfc8de9c6d6639c621cfcca0 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 10:18:24 -0400 Subject: [PATCH 04/36] Fix implementation for u128 --- include/boost/safe_numbers/bit.hpp | 108 +++++++++++++++++++++++++---- 1 file changed, 96 insertions(+), 12 deletions(-) diff --git a/include/boost/safe_numbers/bit.hpp b/include/boost/safe_numbers/bit.hpp index 9c212f1..640f1ff 100644 --- a/include/boost/safe_numbers/bit.hpp +++ b/include/boost/safe_numbers/bit.hpp @@ -40,7 +40,14 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto has_single_bit(const #else - return cuda::std::has_single_bit(static_cast(x)); + if constexpr (std::is_same_v) + { + return boost::int128::has_single_bit(static_cast(x)); + } + else + { + return cuda::std::has_single_bit(static_cast(x)); + } #endif } @@ -57,7 +64,14 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_ceil(const Unsig #else - return UnsignedInt{cuda::std::bit_ceil(static_cast(x))}; + if constexpr (std::is_same_v) + { + return UnsignedInt{boost::int128::bit_ceil(static_cast(x))}; + } + else + { + return UnsignedInt{cuda::std::bit_ceil(static_cast(x))}; + } #endif } @@ -74,7 +88,14 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_floor(const Unsi #else - return UnsignedInt{cuda::std::bit_floor(static_cast(x))}; + if constexpr (std::is_same_v) + { + return UnsignedInt{boost::int128::bit_floor(static_cast(x))}; + } + else + { + return UnsignedInt{cuda::std::bit_floor(static_cast(x))}; + } #endif } @@ -91,7 +112,14 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_width(const Unsi #else - return static_cast(cuda::std::bit_width(static_cast(x))); + if constexpr (std::is_same_v) + { + return static_cast(boost::int128::bit_width(static_cast(x))); + } + else + { + return static_cast(cuda::std::bit_width(static_cast(x))); + } #endif } @@ -108,7 +136,14 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto rotl(const UnsignedI #else - return UnsignedInt{cuda::std::rotl(static_cast(x), s)}; + if constexpr (std::is_same_v) + { + return UnsignedInt{boost::int128::rotl(static_cast(x), s)}; + } + else + { + return UnsignedInt{cuda::std::rotl(static_cast(x), s)}; + } #endif } @@ -125,7 +160,14 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto rotr(const UnsignedI #else - return UnsignedInt{cuda::std::rotr(static_cast(x), s)}; + if constexpr (std::is_same_v) + { + return UnsignedInt{boost::int128::rotr(static_cast(x), s)}; + } + else + { + return UnsignedInt{cuda::std::rotr(static_cast(x), s)}; + } #endif } @@ -142,7 +184,14 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countl_zero(const Un #else - return cuda::std::countl_zero(static_cast(x)); + if constexpr (std::is_same_v) + { + return boost::int128::countl_zero(static_cast(x)); + } + else + { + return cuda::std::countl_zero(static_cast(x)); + } #endif } @@ -159,7 +208,14 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countl_one(const Uns #else - return cuda::std::countl_one(static_cast(x)); + if constexpr (std::is_same_v) + { + return boost::int128::countl_one(static_cast(x)); + } + else + { + return cuda::std::countl_one(static_cast(x)); + } #endif } @@ -176,7 +232,14 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countr_zero(const Un #else - return cuda::std::countr_zero(static_cast(x)); + if constexpr (std::is_same_v) + { + return boost::int128::countr_zero(static_cast(x)); + } + else + { + return cuda::std::countr_zero(static_cast(x)); + } #endif } @@ -193,7 +256,14 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countr_one(const Uns #else - return cuda::std::countr_one(static_cast(x)); + if constexpr (std::is_same_v) + { + return boost::int128::countr_one(static_cast(x)); + } + else + { + return cuda::std::countr_one(static_cast(x)); + } #endif } @@ -210,7 +280,14 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto popcount(const Unsig #else - return cuda::std::popcount(static_cast(x)); + if constexpr (std::is_same_v) + { + return boost::int128::popcount(static_cast(x)); + } + else + { + return cuda::std::popcount(static_cast(x)); + } #endif } @@ -227,7 +304,14 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto byteswap(const Int x #else - return Int{cuda::std::byteswap(static_cast(x))}; + if constexpr (std::is_same_v) + { + return Int{boost::int128::byteswap(static_cast(x))}; + } + else + { + return Int{cuda::std::byteswap(static_cast(x))}; + } #endif } From 7c523e2d42026ba1f9f99e593bef6c994eb22f9e Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 10:26:52 -0400 Subject: [PATCH 05/36] Add CUDA support to byte conversions functions --- .../boost/safe_numbers/byte_conversions.hpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/boost/safe_numbers/byte_conversions.hpp b/include/boost/safe_numbers/byte_conversions.hpp index cba4b15..f74fa96 100644 --- a/include/boost/safe_numbers/byte_conversions.hpp +++ b/include/boost/safe_numbers/byte_conversions.hpp @@ -21,7 +21,7 @@ namespace boost::safe_numbers { template -[[nodiscard]] constexpr auto to_be(const T value) noexcept -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_be(const T value) noexcept -> T { if constexpr (std::endian::native == std::endian::big) { @@ -34,14 +34,14 @@ template } template -[[nodiscard]] constexpr auto from_be(const T value) noexcept -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_be(const T value) noexcept -> T { // Self-inverse return to_be(value); } template -[[nodiscard]] constexpr auto to_le(const T value) noexcept -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_le(const T value) noexcept -> T { if constexpr (std::endian::native == std::endian::little) { @@ -54,21 +54,21 @@ template } template -[[nodiscard]] constexpr auto from_le(const T value) noexcept -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_le(const T value) noexcept -> T { // Self-inverse return to_le(value); } template -[[nodiscard]] constexpr auto to_be_bytes(const T value) noexcept -> std::array +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_be_bytes(const T value) noexcept -> std::array { const auto be_value {to_be(value)}; return std::bit_cast>(be_value); } template -[[nodiscard]] constexpr auto from_be_bytes(const std::span bytes) -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_be_bytes(const std::span bytes) -> T { using underlying_type = detail::underlying_type_t; @@ -103,14 +103,14 @@ template } template -[[nodiscard]] constexpr auto to_le_bytes(const T value) noexcept -> std::array +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_le_bytes(const T value) noexcept -> std::array { const auto le_value {to_le(value)}; return std::bit_cast>(le_value); } template -[[nodiscard]] constexpr auto from_le_bytes(const std::span bytes) -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_le_bytes(const std::span bytes) -> T { using underlying_type = detail::underlying_type_t; @@ -145,7 +145,7 @@ template } template -[[nodiscard]] constexpr auto to_ne_bytes(const T value) noexcept -> std::array +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_ne_bytes(const T value) noexcept -> std::array { if constexpr (std::endian::native == std::endian::little) { @@ -158,7 +158,7 @@ template } template -[[nodiscard]] constexpr auto from_ne_bytes(const std::span bytes) -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_ne_bytes(const std::span bytes) -> T { if constexpr (std::endian::native == std::endian::little) { From 398de51c5061e33284bbbc9e31e3e9902fe2afaa Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 10:33:31 -0400 Subject: [PATCH 06/36] Test byte conversions on CUDA --- test/cuda_jamfile | 32 +++++++++++++ test/test_cuda_u128_from_be.cu | 87 ++++++++++++++++++++++++++++++++++ test/test_cuda_u128_from_le.cu | 87 ++++++++++++++++++++++++++++++++++ test/test_cuda_u128_to_be.cu | 87 ++++++++++++++++++++++++++++++++++ test/test_cuda_u128_to_le.cu | 87 ++++++++++++++++++++++++++++++++++ test/test_cuda_u16_from_be.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u16_from_le.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u16_to_be.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u16_to_le.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u32_from_be.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u32_from_le.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u32_to_be.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u32_to_le.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u64_from_be.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u64_from_le.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u64_to_be.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u64_to_le.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u8_from_be.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u8_from_le.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u8_to_be.cu | 82 ++++++++++++++++++++++++++++++++ test/test_cuda_u8_to_le.cu | 82 ++++++++++++++++++++++++++++++++ 21 files changed, 1692 insertions(+) create mode 100644 test/test_cuda_u128_from_be.cu create mode 100644 test/test_cuda_u128_from_le.cu create mode 100644 test/test_cuda_u128_to_be.cu create mode 100644 test/test_cuda_u128_to_le.cu create mode 100644 test/test_cuda_u16_from_be.cu create mode 100644 test/test_cuda_u16_from_le.cu create mode 100644 test/test_cuda_u16_to_be.cu create mode 100644 test/test_cuda_u16_to_le.cu create mode 100644 test/test_cuda_u32_from_be.cu create mode 100644 test/test_cuda_u32_from_le.cu create mode 100644 test/test_cuda_u32_to_be.cu create mode 100644 test/test_cuda_u32_to_le.cu create mode 100644 test/test_cuda_u64_from_be.cu create mode 100644 test/test_cuda_u64_from_le.cu create mode 100644 test/test_cuda_u64_to_be.cu create mode 100644 test/test_cuda_u64_to_le.cu create mode 100644 test/test_cuda_u8_from_be.cu create mode 100644 test/test_cuda_u8_from_le.cu create mode 100644 test/test_cuda_u8_to_be.cu create mode 100644 test/test_cuda_u8_to_le.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 86f9093..099be30 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -157,3 +157,35 @@ run test_cuda_u128_countr_one.cu ; run test_cuda_u128_popcount.cu ; run test_cuda_u128_byteswap.cu ; run test_cuda_u128_bitswap.cu ; + +# Byte conversion tests + +# u8 byte conversion tests +run test_cuda_u8_to_be.cu ; +run test_cuda_u8_from_be.cu ; +run test_cuda_u8_to_le.cu ; +run test_cuda_u8_from_le.cu ; + +# u16 byte conversion tests +run test_cuda_u16_to_be.cu ; +run test_cuda_u16_from_be.cu ; +run test_cuda_u16_to_le.cu ; +run test_cuda_u16_from_le.cu ; + +# u32 byte conversion tests +run test_cuda_u32_to_be.cu ; +run test_cuda_u32_from_be.cu ; +run test_cuda_u32_to_le.cu ; +run test_cuda_u32_from_le.cu ; + +# u64 byte conversion tests +run test_cuda_u64_to_be.cu ; +run test_cuda_u64_from_be.cu ; +run test_cuda_u64_to_le.cu ; +run test_cuda_u64_from_le.cu ; + +# u128 byte conversion tests +run test_cuda_u128_to_be.cu ; +run test_cuda_u128_from_be.cu ; +run test_cuda_u128_to_le.cu ; +run test_cuda_u128_from_le.cu ; diff --git a/test/test_cuda_u128_from_be.cu b/test/test_cuda_u128_from_be.cu new file mode 100644 index 0000000..56476d9 --- /dev/null +++ b/test/test_cuda_u128_from_be.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::from_be(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::from_be(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_from_le.cu b/test/test_cuda_u128_from_le.cu new file mode 100644 index 0000000..4558a42 --- /dev/null +++ b/test/test_cuda_u128_from_le.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::from_le(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::from_le(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_to_be.cu b/test/test_cuda_u128_to_be.cu new file mode 100644 index 0000000..0e1e6cf --- /dev/null +++ b/test/test_cuda_u128_to_be.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::to_be(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::to_be(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_to_le.cu b/test/test_cuda_u128_to_le.cu new file mode 100644 index 0000000..461d47d --- /dev/null +++ b/test/test_cuda_u128_to_le.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::to_le(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::to_le(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_from_be.cu b/test/test_cuda_u16_from_be.cu new file mode 100644 index 0000000..1eda1cd --- /dev/null +++ b/test/test_cuda_u16_from_be.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::from_be(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::from_be(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_from_le.cu b/test/test_cuda_u16_from_le.cu new file mode 100644 index 0000000..389a8fd --- /dev/null +++ b/test/test_cuda_u16_from_le.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::from_le(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::from_le(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_to_be.cu b/test/test_cuda_u16_to_be.cu new file mode 100644 index 0000000..9268e37 --- /dev/null +++ b/test/test_cuda_u16_to_be.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::to_be(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::to_be(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_to_le.cu b/test/test_cuda_u16_to_le.cu new file mode 100644 index 0000000..7b0ce48 --- /dev/null +++ b/test/test_cuda_u16_to_le.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::to_le(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::to_le(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_from_be.cu b/test/test_cuda_u32_from_be.cu new file mode 100644 index 0000000..409cdda --- /dev/null +++ b/test/test_cuda_u32_from_be.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::from_be(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::from_be(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_from_le.cu b/test/test_cuda_u32_from_le.cu new file mode 100644 index 0000000..358f4f4 --- /dev/null +++ b/test/test_cuda_u32_from_le.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::from_le(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::from_le(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_to_be.cu b/test/test_cuda_u32_to_be.cu new file mode 100644 index 0000000..abf22d7 --- /dev/null +++ b/test/test_cuda_u32_to_be.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::to_be(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::to_be(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_to_le.cu b/test/test_cuda_u32_to_le.cu new file mode 100644 index 0000000..2d31f30 --- /dev/null +++ b/test/test_cuda_u32_to_le.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::to_le(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::to_le(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_from_be.cu b/test/test_cuda_u64_from_be.cu new file mode 100644 index 0000000..e867176 --- /dev/null +++ b/test/test_cuda_u64_from_be.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::from_be(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::from_be(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_from_le.cu b/test/test_cuda_u64_from_le.cu new file mode 100644 index 0000000..29e4024 --- /dev/null +++ b/test/test_cuda_u64_from_le.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::from_le(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::from_le(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_to_be.cu b/test/test_cuda_u64_to_be.cu new file mode 100644 index 0000000..01613a0 --- /dev/null +++ b/test/test_cuda_u64_to_be.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::to_be(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::to_be(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_to_le.cu b/test/test_cuda_u64_to_le.cu new file mode 100644 index 0000000..80ce98a --- /dev/null +++ b/test/test_cuda_u64_to_le.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::to_le(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::to_le(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_from_be.cu b/test/test_cuda_u8_from_be.cu new file mode 100644 index 0000000..b914fdd --- /dev/null +++ b/test/test_cuda_u8_from_be.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::from_be(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::from_be(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_from_le.cu b/test/test_cuda_u8_from_le.cu new file mode 100644 index 0000000..4f669cc --- /dev/null +++ b/test/test_cuda_u8_from_le.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::from_le(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::from_le(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_to_be.cu b/test/test_cuda_u8_to_be.cu new file mode 100644 index 0000000..ae5801c --- /dev/null +++ b/test/test_cuda_u8_to_be.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::to_be(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::to_be(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_to_le.cu b/test/test_cuda_u8_to_le.cu new file mode 100644 index 0000000..d51dd61 --- /dev/null +++ b/test/test_cuda_u8_to_le.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::to_le(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::to_le(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 9671ba946577ad4cbeb301920b44b49aac34c644 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 10:41:14 -0400 Subject: [PATCH 07/36] Add CUDA support to charconv functions --- include/boost/safe_numbers/charconv.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/boost/safe_numbers/charconv.hpp b/include/boost/safe_numbers/charconv.hpp index 9b43c39..bd8c4e5 100644 --- a/include/boost/safe_numbers/charconv.hpp +++ b/include/boost/safe_numbers/charconv.hpp @@ -18,7 +18,7 @@ namespace boost::charconv { template -constexpr auto from_chars(const char* first, const char* last, T& value, int base = 10) +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto from_chars(const char* first, const char* last, T& value, int base = 10) -> charconv::from_chars_result { using underlying_type = safe_numbers::detail::underlying_type_t; @@ -31,7 +31,7 @@ constexpr auto from_chars(const char* first, const char* last, T& value, int bas } template -constexpr auto to_chars(char* first, char* last, const T value, int base = 10) +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto to_chars(char* first, char* last, const T value, int base = 10) -> charconv::to_chars_result { using underlying_type = safe_numbers::detail::underlying_type_t; From a8a70970d61d36d0a38d664e0fe308d8de56d21b Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 10:41:24 -0400 Subject: [PATCH 08/36] Add CUDA testing of charconv functions --- test/cuda_jamfile | 22 +++++ test/test_cuda_u128_charconv.cu | 95 ++++++++++++++++++ test/test_cuda_u128_charconv_all_bases.cu | 111 ++++++++++++++++++++++ test/test_cuda_u16_charconv.cu | 90 ++++++++++++++++++ test/test_cuda_u16_charconv_all_bases.cu | 106 +++++++++++++++++++++ test/test_cuda_u32_charconv.cu | 90 ++++++++++++++++++ test/test_cuda_u32_charconv_all_bases.cu | 106 +++++++++++++++++++++ test/test_cuda_u64_charconv.cu | 90 ++++++++++++++++++ test/test_cuda_u64_charconv_all_bases.cu | 106 +++++++++++++++++++++ test/test_cuda_u8_charconv.cu | 90 ++++++++++++++++++ test/test_cuda_u8_charconv_all_bases.cu | 106 +++++++++++++++++++++ 11 files changed, 1012 insertions(+) create mode 100644 test/test_cuda_u128_charconv.cu create mode 100644 test/test_cuda_u128_charconv_all_bases.cu create mode 100644 test/test_cuda_u16_charconv.cu create mode 100644 test/test_cuda_u16_charconv_all_bases.cu create mode 100644 test/test_cuda_u32_charconv.cu create mode 100644 test/test_cuda_u32_charconv_all_bases.cu create mode 100644 test/test_cuda_u64_charconv.cu create mode 100644 test/test_cuda_u64_charconv_all_bases.cu create mode 100644 test/test_cuda_u8_charconv.cu create mode 100644 test/test_cuda_u8_charconv_all_bases.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 099be30..efc2d0f 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -189,3 +189,25 @@ run test_cuda_u128_to_be.cu ; run test_cuda_u128_from_be.cu ; run test_cuda_u128_to_le.cu ; run test_cuda_u128_from_le.cu ; + +# Charconv tests + +# u8 charconv tests +run test_cuda_u8_charconv.cu ; +run test_cuda_u8_charconv_all_bases.cu ; + +# u16 charconv tests +run test_cuda_u16_charconv.cu ; +run test_cuda_u16_charconv_all_bases.cu ; + +# u32 charconv tests +run test_cuda_u32_charconv.cu ; +run test_cuda_u32_charconv_all_bases.cu ; + +# u64 charconv tests +run test_cuda_u64_charconv.cu ; +run test_cuda_u64_charconv_all_bases.cu ; + +# u128 charconv tests +run test_cuda_u128_charconv.cu ; +run test_cuda_u128_charconv_all_bases.cu ; diff --git a/test/test_cuda_u128_charconv.cu b/test/test_cuda_u128_charconv.cu new file mode 100644 index 0000000..7a7e2b8 --- /dev/null +++ b/test/test_cuda_u128_charconv.cu @@ -0,0 +1,95 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + char buf[64] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), in[i])}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed); + out[i] = parsed; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + char buf[64] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i])}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed); + results.push_back(parsed); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_charconv_all_bases.cu b/test/test_cuda_u128_charconv_all_bases.cu new file mode 100644 index 0000000..762ba5a --- /dev/null +++ b/test/test_cuda_u128_charconv_all_bases.cu @@ -0,0 +1,111 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + int pass_count {0}; + for (int base = 2; base <= 36; ++base) + { + char buf[256] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), in[i], base)}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed, base); + if (parsed == in[i]) + { + ++pass_count; + } + } + out[i] = pass_count; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + int pass_count {0}; + for (int base = 2; base <= 36; ++base) + { + char buf[256] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i], base)}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed, base); + if (parsed == input_vector[i]) + { + ++pass_count; + } + } + results.push_back(pass_count); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_charconv.cu b/test/test_cuda_u16_charconv.cu new file mode 100644 index 0000000..cd53a19 --- /dev/null +++ b/test/test_cuda_u16_charconv.cu @@ -0,0 +1,90 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + char buf[64] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), in[i])}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed); + out[i] = parsed; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + char buf[64] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i])}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed); + results.push_back(parsed); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_charconv_all_bases.cu b/test/test_cuda_u16_charconv_all_bases.cu new file mode 100644 index 0000000..50f64a7 --- /dev/null +++ b/test/test_cuda_u16_charconv_all_bases.cu @@ -0,0 +1,106 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + int pass_count {0}; + for (int base = 2; base <= 36; ++base) + { + char buf[256] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), in[i], base)}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed, base); + if (parsed == in[i]) + { + ++pass_count; + } + } + out[i] = pass_count; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + int pass_count {0}; + for (int base = 2; base <= 36; ++base) + { + char buf[256] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i], base)}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed, base); + if (parsed == input_vector[i]) + { + ++pass_count; + } + } + results.push_back(pass_count); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_charconv.cu b/test/test_cuda_u32_charconv.cu new file mode 100644 index 0000000..b618742 --- /dev/null +++ b/test/test_cuda_u32_charconv.cu @@ -0,0 +1,90 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + char buf[64] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), in[i])}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed); + out[i] = parsed; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + char buf[64] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i])}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed); + results.push_back(parsed); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_charconv_all_bases.cu b/test/test_cuda_u32_charconv_all_bases.cu new file mode 100644 index 0000000..70c43e7 --- /dev/null +++ b/test/test_cuda_u32_charconv_all_bases.cu @@ -0,0 +1,106 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + int pass_count {0}; + for (int base = 2; base <= 36; ++base) + { + char buf[256] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), in[i], base)}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed, base); + if (parsed == in[i]) + { + ++pass_count; + } + } + out[i] = pass_count; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + int pass_count {0}; + for (int base = 2; base <= 36; ++base) + { + char buf[256] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i], base)}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed, base); + if (parsed == input_vector[i]) + { + ++pass_count; + } + } + results.push_back(pass_count); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_charconv.cu b/test/test_cuda_u64_charconv.cu new file mode 100644 index 0000000..75998ff --- /dev/null +++ b/test/test_cuda_u64_charconv.cu @@ -0,0 +1,90 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + char buf[64] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), in[i])}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed); + out[i] = parsed; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + char buf[64] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i])}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed); + results.push_back(parsed); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_charconv_all_bases.cu b/test/test_cuda_u64_charconv_all_bases.cu new file mode 100644 index 0000000..5de01ea --- /dev/null +++ b/test/test_cuda_u64_charconv_all_bases.cu @@ -0,0 +1,106 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + int pass_count {0}; + for (int base = 2; base <= 36; ++base) + { + char buf[256] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), in[i], base)}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed, base); + if (parsed == in[i]) + { + ++pass_count; + } + } + out[i] = pass_count; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + int pass_count {0}; + for (int base = 2; base <= 36; ++base) + { + char buf[256] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i], base)}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed, base); + if (parsed == input_vector[i]) + { + ++pass_count; + } + } + results.push_back(pass_count); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_charconv.cu b/test/test_cuda_u8_charconv.cu new file mode 100644 index 0000000..daaed8c --- /dev/null +++ b/test/test_cuda_u8_charconv.cu @@ -0,0 +1,90 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + char buf[64] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), in[i])}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed); + out[i] = parsed; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + char buf[64] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i])}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed); + results.push_back(parsed); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_charconv_all_bases.cu b/test/test_cuda_u8_charconv_all_bases.cu new file mode 100644 index 0000000..481caf1 --- /dev/null +++ b/test/test_cuda_u8_charconv_all_bases.cu @@ -0,0 +1,106 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + int pass_count {0}; + for (int base = 2; base <= 36; ++base) + { + char buf[256] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), in[i], base)}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed, base); + if (parsed == in[i]) + { + ++pass_count; + } + } + out[i] = pass_count; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + int pass_count {0}; + for (int base = 2; base <= 36; ++base) + { + char buf[256] {}; + auto tc_result {boost::charconv::to_chars(buf, buf + sizeof(buf), input_vector[i], base)}; + test_type parsed {}; + boost::charconv::from_chars(buf, tc_result.ptr, parsed, base); + if (parsed == input_vector[i]) + { + ++pass_count; + } + } + results.push_back(pass_count); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From c6c150b9a08b36c6059b9a4b542a3eeab82d3e97 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 10:45:32 -0400 Subject: [PATCH 09/36] Add CUDA support to integer utilities --- include/boost/safe_numbers/detail/rtz.hpp | 12 +++++----- .../boost/safe_numbers/integer_utilities.hpp | 24 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/boost/safe_numbers/detail/rtz.hpp b/include/boost/safe_numbers/detail/rtz.hpp index d2018ee..2d12c98 100644 --- a/include/boost/safe_numbers/detail/rtz.hpp +++ b/include/boost/safe_numbers/detail/rtz.hpp @@ -21,7 +21,7 @@ namespace boost::safe_numbers::detail { // n is assumed to be at most of bit_width bits template -constexpr auto rotr(UInt n, unsigned int r) noexcept -> UInt +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto rotr(UInt n, unsigned int r) noexcept -> UInt { static_assert(bit_width >= std::numeric_limits::digits); @@ -38,7 +38,7 @@ struct remove_trailing_zeros_return std::size_t number_of_removed_zeros; }; -constexpr auto remove_trailing_zeros(std::uint8_t n) noexcept -> remove_trailing_zeros_return +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto remove_trailing_zeros(std::uint8_t n) noexcept -> remove_trailing_zeros_return { std::size_t s {}; @@ -55,7 +55,7 @@ constexpr auto remove_trailing_zeros(std::uint8_t n) noexcept -> remove_trailing return {n, s}; } -constexpr auto remove_trailing_zeros(std::uint16_t n) noexcept -> remove_trailing_zeros_return +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto remove_trailing_zeros(std::uint16_t n) noexcept -> remove_trailing_zeros_return { std::size_t s {}; @@ -77,7 +77,7 @@ constexpr auto remove_trailing_zeros(std::uint16_t n) noexcept -> remove_trailin return {n, s}; } -constexpr auto remove_trailing_zeros(std::uint32_t n) noexcept -> remove_trailing_zeros_return +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto remove_trailing_zeros(std::uint32_t n) noexcept -> remove_trailing_zeros_return { std::size_t s {}; @@ -104,7 +104,7 @@ constexpr auto remove_trailing_zeros(std::uint32_t n) noexcept -> remove_trailin return {n, s}; } -constexpr auto remove_trailing_zeros(std::uint64_t n) noexcept -> remove_trailing_zeros_return +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto remove_trailing_zeros(std::uint64_t n) noexcept -> remove_trailing_zeros_return { std::size_t s {}; @@ -136,7 +136,7 @@ constexpr auto remove_trailing_zeros(std::uint64_t n) noexcept -> remove_trailin return {n, s}; } -constexpr auto remove_trailing_zeros(int128::uint128_t n) noexcept -> remove_trailing_zeros_return +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto remove_trailing_zeros(int128::uint128_t n) noexcept -> remove_trailing_zeros_return { std::size_t s {}; diff --git a/include/boost/safe_numbers/integer_utilities.hpp b/include/boost/safe_numbers/integer_utilities.hpp index 991b7a9..66d207b 100644 --- a/include/boost/safe_numbers/integer_utilities.hpp +++ b/include/boost/safe_numbers/integer_utilities.hpp @@ -15,7 +15,7 @@ namespace boost::safe_numbers { // Newton's method as it can't possibly overflow, and converges rapidly template -[[nodiscard]] constexpr auto isqrt(const T val) -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto isqrt(const T val) -> T { using underlying_type = detail::underlying_type_t; @@ -42,7 +42,7 @@ template } template -[[nodiscard]] constexpr auto remove_trailing_zeros(const T n) +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto remove_trailing_zeros(const T n) { using underlying_type = detail::underlying_type_t; @@ -55,7 +55,7 @@ template } template -[[nodiscard]] constexpr auto is_power_10(const T n) -> bool +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto is_power_10(const T n) -> bool { using underlying_type = detail::underlying_type_t; @@ -64,14 +64,14 @@ template } template -[[nodiscard]] constexpr auto is_power_2(const T n) noexcept -> bool +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto is_power_2(const T n) noexcept -> bool { return has_single_bit(n); } // Integer log base 2: floor(log2(n)) == bit_width(n) - 1 template -[[nodiscard]] constexpr auto ilog2(const T n) -> int +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ilog2(const T n) -> int { using underlying_type = detail::underlying_type_t; @@ -86,7 +86,7 @@ template // Integer log base 10: floor(ilog10(n)) == num_digits(n) - 1 // Uses MSB-based approximation with power-of-10 table lookup (O(1)) template -[[nodiscard]] constexpr auto ilog10(const T n) -> int +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ilog10(const T n) -> int { using underlying_type = detail::underlying_type_t; @@ -101,7 +101,7 @@ template // Integer log arbitrary base: floor(log_base(n)) // Repeated division: O(log_base(n)) divisions template -[[nodiscard]] constexpr auto ilog(const T n, const T base) -> int +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ilog(const T n, const T base) -> int { using underlying_type = detail::underlying_type_t; @@ -132,7 +132,7 @@ namespace detail { // Iterative exponentiation by squaring: O(log b) multiplications template -[[nodiscard]] constexpr auto ipow_impl(T base, T exp) -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ipow_impl(T base, T exp) -> T { using underlying_type = underlying_type_t; @@ -157,19 +157,19 @@ template } // namespace detail template -[[nodiscard]] constexpr auto ipow(const T a, const T b) -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ipow(const T a, const T b) -> T { return detail::ipow_impl(a, b); } template -[[nodiscard]] constexpr auto abs_diff(const T a, const T b) noexcept -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto abs_diff(const T a, const T b) noexcept -> T { return a > b ? a - b : b - a; } template -[[nodiscard]] constexpr auto div_ceil(const T a, const T b) noexcept -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto div_ceil(const T a, const T b) noexcept -> T { using underlying_type = detail::underlying_type_t; @@ -187,7 +187,7 @@ template } template -[[nodiscard]] constexpr auto next_multiple_of(const T a, const T b) noexcept -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto next_multiple_of(const T a, const T b) noexcept -> T { return div_ceil(a, b) * b; } From 2aaf8ad3c224b497af93abd896ff3ba951b3f8e4 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 10:50:04 -0400 Subject: [PATCH 10/36] Add CUDA testing of integer utilities --- test/cuda_jamfile | 67 +++++++++++++++ test/test_cuda_u128_abs_diff.cu | 90 ++++++++++++++++++++ test/test_cuda_u128_div_ceil.cu | 90 ++++++++++++++++++++ test/test_cuda_u128_ilog.cu | 87 +++++++++++++++++++ test/test_cuda_u128_ilog10.cu | 87 +++++++++++++++++++ test/test_cuda_u128_ilog2.cu | 87 +++++++++++++++++++ test/test_cuda_u128_ipow.cu | 90 ++++++++++++++++++++ test/test_cuda_u128_is_power_10.cu | 87 +++++++++++++++++++ test/test_cuda_u128_is_power_2.cu | 87 +++++++++++++++++++ test/test_cuda_u128_isqrt.cu | 87 +++++++++++++++++++ test/test_cuda_u128_next_multiple_of.cu | 90 ++++++++++++++++++++ test/test_cuda_u128_remove_trailing_zeros.cu | 87 +++++++++++++++++++ test/test_cuda_u16_abs_diff.cu | 85 ++++++++++++++++++ test/test_cuda_u16_div_ceil.cu | 85 ++++++++++++++++++ test/test_cuda_u16_ilog.cu | 82 ++++++++++++++++++ test/test_cuda_u16_ilog10.cu | 82 ++++++++++++++++++ test/test_cuda_u16_ilog2.cu | 82 ++++++++++++++++++ test/test_cuda_u16_ipow.cu | 85 ++++++++++++++++++ test/test_cuda_u16_is_power_10.cu | 82 ++++++++++++++++++ test/test_cuda_u16_is_power_2.cu | 82 ++++++++++++++++++ test/test_cuda_u16_isqrt.cu | 82 ++++++++++++++++++ test/test_cuda_u16_next_multiple_of.cu | 85 ++++++++++++++++++ test/test_cuda_u16_remove_trailing_zeros.cu | 82 ++++++++++++++++++ test/test_cuda_u32_abs_diff.cu | 85 ++++++++++++++++++ test/test_cuda_u32_div_ceil.cu | 85 ++++++++++++++++++ test/test_cuda_u32_ilog.cu | 82 ++++++++++++++++++ test/test_cuda_u32_ilog10.cu | 82 ++++++++++++++++++ test/test_cuda_u32_ilog2.cu | 82 ++++++++++++++++++ test/test_cuda_u32_ipow.cu | 85 ++++++++++++++++++ test/test_cuda_u32_is_power_10.cu | 82 ++++++++++++++++++ test/test_cuda_u32_is_power_2.cu | 82 ++++++++++++++++++ test/test_cuda_u32_isqrt.cu | 82 ++++++++++++++++++ test/test_cuda_u32_next_multiple_of.cu | 85 ++++++++++++++++++ test/test_cuda_u32_remove_trailing_zeros.cu | 82 ++++++++++++++++++ test/test_cuda_u64_abs_diff.cu | 85 ++++++++++++++++++ test/test_cuda_u64_div_ceil.cu | 85 ++++++++++++++++++ test/test_cuda_u64_ilog.cu | 82 ++++++++++++++++++ test/test_cuda_u64_ilog10.cu | 82 ++++++++++++++++++ test/test_cuda_u64_ilog2.cu | 82 ++++++++++++++++++ test/test_cuda_u64_ipow.cu | 85 ++++++++++++++++++ test/test_cuda_u64_is_power_10.cu | 82 ++++++++++++++++++ test/test_cuda_u64_is_power_2.cu | 82 ++++++++++++++++++ test/test_cuda_u64_isqrt.cu | 82 ++++++++++++++++++ test/test_cuda_u64_next_multiple_of.cu | 85 ++++++++++++++++++ test/test_cuda_u64_remove_trailing_zeros.cu | 82 ++++++++++++++++++ test/test_cuda_u8_abs_diff.cu | 85 ++++++++++++++++++ test/test_cuda_u8_div_ceil.cu | 85 ++++++++++++++++++ test/test_cuda_u8_ilog.cu | 82 ++++++++++++++++++ test/test_cuda_u8_ilog10.cu | 82 ++++++++++++++++++ test/test_cuda_u8_ilog2.cu | 82 ++++++++++++++++++ test/test_cuda_u8_ipow.cu | 85 ++++++++++++++++++ test/test_cuda_u8_is_power_10.cu | 82 ++++++++++++++++++ test/test_cuda_u8_is_power_2.cu | 82 ++++++++++++++++++ test/test_cuda_u8_isqrt.cu | 82 ++++++++++++++++++ test/test_cuda_u8_next_multiple_of.cu | 85 ++++++++++++++++++ test/test_cuda_u8_remove_trailing_zeros.cu | 82 ++++++++++++++++++ 56 files changed, 4692 insertions(+) create mode 100644 test/test_cuda_u128_abs_diff.cu create mode 100644 test/test_cuda_u128_div_ceil.cu create mode 100644 test/test_cuda_u128_ilog.cu create mode 100644 test/test_cuda_u128_ilog10.cu create mode 100644 test/test_cuda_u128_ilog2.cu create mode 100644 test/test_cuda_u128_ipow.cu create mode 100644 test/test_cuda_u128_is_power_10.cu create mode 100644 test/test_cuda_u128_is_power_2.cu create mode 100644 test/test_cuda_u128_isqrt.cu create mode 100644 test/test_cuda_u128_next_multiple_of.cu create mode 100644 test/test_cuda_u128_remove_trailing_zeros.cu create mode 100644 test/test_cuda_u16_abs_diff.cu create mode 100644 test/test_cuda_u16_div_ceil.cu create mode 100644 test/test_cuda_u16_ilog.cu create mode 100644 test/test_cuda_u16_ilog10.cu create mode 100644 test/test_cuda_u16_ilog2.cu create mode 100644 test/test_cuda_u16_ipow.cu create mode 100644 test/test_cuda_u16_is_power_10.cu create mode 100644 test/test_cuda_u16_is_power_2.cu create mode 100644 test/test_cuda_u16_isqrt.cu create mode 100644 test/test_cuda_u16_next_multiple_of.cu create mode 100644 test/test_cuda_u16_remove_trailing_zeros.cu create mode 100644 test/test_cuda_u32_abs_diff.cu create mode 100644 test/test_cuda_u32_div_ceil.cu create mode 100644 test/test_cuda_u32_ilog.cu create mode 100644 test/test_cuda_u32_ilog10.cu create mode 100644 test/test_cuda_u32_ilog2.cu create mode 100644 test/test_cuda_u32_ipow.cu create mode 100644 test/test_cuda_u32_is_power_10.cu create mode 100644 test/test_cuda_u32_is_power_2.cu create mode 100644 test/test_cuda_u32_isqrt.cu create mode 100644 test/test_cuda_u32_next_multiple_of.cu create mode 100644 test/test_cuda_u32_remove_trailing_zeros.cu create mode 100644 test/test_cuda_u64_abs_diff.cu create mode 100644 test/test_cuda_u64_div_ceil.cu create mode 100644 test/test_cuda_u64_ilog.cu create mode 100644 test/test_cuda_u64_ilog10.cu create mode 100644 test/test_cuda_u64_ilog2.cu create mode 100644 test/test_cuda_u64_ipow.cu create mode 100644 test/test_cuda_u64_is_power_10.cu create mode 100644 test/test_cuda_u64_is_power_2.cu create mode 100644 test/test_cuda_u64_isqrt.cu create mode 100644 test/test_cuda_u64_next_multiple_of.cu create mode 100644 test/test_cuda_u64_remove_trailing_zeros.cu create mode 100644 test/test_cuda_u8_abs_diff.cu create mode 100644 test/test_cuda_u8_div_ceil.cu create mode 100644 test/test_cuda_u8_ilog.cu create mode 100644 test/test_cuda_u8_ilog10.cu create mode 100644 test/test_cuda_u8_ilog2.cu create mode 100644 test/test_cuda_u8_ipow.cu create mode 100644 test/test_cuda_u8_is_power_10.cu create mode 100644 test/test_cuda_u8_is_power_2.cu create mode 100644 test/test_cuda_u8_isqrt.cu create mode 100644 test/test_cuda_u8_next_multiple_of.cu create mode 100644 test/test_cuda_u8_remove_trailing_zeros.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index efc2d0f..4b30acc 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -211,3 +211,70 @@ run test_cuda_u64_charconv_all_bases.cu ; # u128 charconv tests run test_cuda_u128_charconv.cu ; run test_cuda_u128_charconv_all_bases.cu ; + +# Integer utilities tests + +# u8 integer utilities tests +run test_cuda_u8_isqrt.cu ; +run test_cuda_u8_remove_trailing_zeros.cu ; +run test_cuda_u8_is_power_10.cu ; +run test_cuda_u8_is_power_2.cu ; +run test_cuda_u8_ilog2.cu ; +run test_cuda_u8_ilog10.cu ; +run test_cuda_u8_ilog.cu ; +run test_cuda_u8_ipow.cu ; +run test_cuda_u8_abs_diff.cu ; +run test_cuda_u8_div_ceil.cu ; +run test_cuda_u8_next_multiple_of.cu ; + +# u16 integer utilities tests +run test_cuda_u16_isqrt.cu ; +run test_cuda_u16_remove_trailing_zeros.cu ; +run test_cuda_u16_is_power_10.cu ; +run test_cuda_u16_is_power_2.cu ; +run test_cuda_u16_ilog2.cu ; +run test_cuda_u16_ilog10.cu ; +run test_cuda_u16_ilog.cu ; +run test_cuda_u16_ipow.cu ; +run test_cuda_u16_abs_diff.cu ; +run test_cuda_u16_div_ceil.cu ; +run test_cuda_u16_next_multiple_of.cu ; + +# u32 integer utilities tests +run test_cuda_u32_isqrt.cu ; +run test_cuda_u32_remove_trailing_zeros.cu ; +run test_cuda_u32_is_power_10.cu ; +run test_cuda_u32_is_power_2.cu ; +run test_cuda_u32_ilog2.cu ; +run test_cuda_u32_ilog10.cu ; +run test_cuda_u32_ilog.cu ; +run test_cuda_u32_ipow.cu ; +run test_cuda_u32_abs_diff.cu ; +run test_cuda_u32_div_ceil.cu ; +run test_cuda_u32_next_multiple_of.cu ; + +# u64 integer utilities tests +run test_cuda_u64_isqrt.cu ; +run test_cuda_u64_remove_trailing_zeros.cu ; +run test_cuda_u64_is_power_10.cu ; +run test_cuda_u64_is_power_2.cu ; +run test_cuda_u64_ilog2.cu ; +run test_cuda_u64_ilog10.cu ; +run test_cuda_u64_ilog.cu ; +run test_cuda_u64_ipow.cu ; +run test_cuda_u64_abs_diff.cu ; +run test_cuda_u64_div_ceil.cu ; +run test_cuda_u64_next_multiple_of.cu ; + +# u128 integer utilities tests +run test_cuda_u128_isqrt.cu ; +run test_cuda_u128_remove_trailing_zeros.cu ; +run test_cuda_u128_is_power_10.cu ; +run test_cuda_u128_is_power_2.cu ; +run test_cuda_u128_ilog2.cu ; +run test_cuda_u128_ilog10.cu ; +run test_cuda_u128_ilog.cu ; +run test_cuda_u128_ipow.cu ; +run test_cuda_u128_abs_diff.cu ; +run test_cuda_u128_div_ceil.cu ; +run test_cuda_u128_next_multiple_of.cu ; diff --git a/test/test_cuda_u128_abs_diff.cu b/test/test_cuda_u128_abs_diff.cu new file mode 100644 index 0000000..fb863d7 --- /dev/null +++ b/test/test_cuda_u128_abs_diff.cu @@ -0,0 +1,90 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::abs_diff(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + boost::random::uniform_int_distribution dist2{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::abs_diff(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_div_ceil.cu b/test/test_cuda_u128_div_ceil.cu new file mode 100644 index 0000000..a23045e --- /dev/null +++ b/test/test_cuda_u128_div_ceil.cu @@ -0,0 +1,90 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::div_ceil(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + boost::random::uniform_int_distribution dist2{basis_type{1U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::div_ceil(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_ilog.cu b/test/test_cuda_u128_ilog.cu new file mode 100644 index 0000000..4b61aa3 --- /dev/null +++ b/test/test_cuda_u128_ilog.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog(in[i], test_type{static_cast(7)}); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{1U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog(input_vector[i], test_type{static_cast(7)})); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_ilog10.cu b/test/test_cuda_u128_ilog10.cu new file mode 100644 index 0000000..7c2f731 --- /dev/null +++ b/test/test_cuda_u128_ilog10.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog10(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{1U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog10(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_ilog2.cu b/test/test_cuda_u128_ilog2.cu new file mode 100644 index 0000000..2764202 --- /dev/null +++ b/test/test_cuda_u128_ilog2.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog2(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{1U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog2(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_ipow.cu b/test/test_cuda_u128_ipow.cu new file mode 100644 index 0000000..14fe473 --- /dev/null +++ b/test/test_cuda_u128_ipow.cu @@ -0,0 +1,90 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ipow(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + boost::random::uniform_int_distribution dist2{basis_type{0U}, basis_type{7U}}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ipow(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_is_power_10.cu b/test/test_cuda_u128_is_power_10.cu new file mode 100644 index 0000000..5d7048f --- /dev/null +++ b/test/test_cuda_u128_is_power_10.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::is_power_10(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{1U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::is_power_10(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_is_power_2.cu b/test/test_cuda_u128_is_power_2.cu new file mode 100644 index 0000000..2775f07 --- /dev/null +++ b/test/test_cuda_u128_is_power_2.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::is_power_2(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::is_power_2(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_isqrt.cu b/test/test_cuda_u128_isqrt.cu new file mode 100644 index 0000000..a54b27f --- /dev/null +++ b/test/test_cuda_u128_isqrt.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::isqrt(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::isqrt(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_next_multiple_of.cu b/test/test_cuda_u128_next_multiple_of.cu new file mode 100644 index 0000000..313092a --- /dev/null +++ b/test/test_cuda_u128_next_multiple_of.cu @@ -0,0 +1,90 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::next_multiple_of(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)() / basis_type{2U}}; + boost::random::uniform_int_distribution dist2{basis_type{1U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::next_multiple_of(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_remove_trailing_zeros.cu b/test/test_cuda_u128_remove_trailing_zeros.cu new file mode 100644 index 0000000..5ef04b7 --- /dev/null +++ b/test/test_cuda_u128_remove_trailing_zeros.cu @@ -0,0 +1,87 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = test_type{boost::safe_numbers::remove_trailing_zeros(in[i]).trimmed_number}; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{1U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(test_type{boost::safe_numbers::remove_trailing_zeros(input_vector[i]).trimmed_number}); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_abs_diff.cu b/test/test_cuda_u16_abs_diff.cu new file mode 100644 index 0000000..830196a --- /dev/null +++ b/test/test_cuda_u16_abs_diff.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::abs_diff(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + std::uniform_int_distribution dist2{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{static_cast(dist(rng))}; + input_vector2[i] = test_type{static_cast(dist2(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::abs_diff(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_div_ceil.cu b/test/test_cuda_u16_div_ceil.cu new file mode 100644 index 0000000..52013ad --- /dev/null +++ b/test/test_cuda_u16_div_ceil.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::div_ceil(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + std::uniform_int_distribution dist2{1U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{static_cast(dist(rng))}; + input_vector2[i] = test_type{static_cast(dist2(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::div_ceil(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_ilog.cu b/test/test_cuda_u16_ilog.cu new file mode 100644 index 0000000..39bf5de --- /dev/null +++ b/test/test_cuda_u16_ilog.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog(in[i], test_type{static_cast(7)}); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{1U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog(input_vector[i], test_type{static_cast(7)})); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_ilog10.cu b/test/test_cuda_u16_ilog10.cu new file mode 100644 index 0000000..3eff426 --- /dev/null +++ b/test/test_cuda_u16_ilog10.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog10(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{1U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog10(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_ilog2.cu b/test/test_cuda_u16_ilog2.cu new file mode 100644 index 0000000..23696c4 --- /dev/null +++ b/test/test_cuda_u16_ilog2.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog2(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{1U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog2(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_ipow.cu b/test/test_cuda_u16_ipow.cu new file mode 100644 index 0000000..a7c8363 --- /dev/null +++ b/test/test_cuda_u16_ipow.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ipow(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + std::uniform_int_distribution dist2{0U, 7U}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{static_cast(dist(rng))}; + input_vector2[i] = test_type{static_cast(dist2(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ipow(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_is_power_10.cu b/test/test_cuda_u16_is_power_10.cu new file mode 100644 index 0000000..1d680da --- /dev/null +++ b/test/test_cuda_u16_is_power_10.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::is_power_10(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{1U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::is_power_10(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_is_power_2.cu b/test/test_cuda_u16_is_power_2.cu new file mode 100644 index 0000000..e53ae2c --- /dev/null +++ b/test/test_cuda_u16_is_power_2.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::is_power_2(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::is_power_2(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_isqrt.cu b/test/test_cuda_u16_isqrt.cu new file mode 100644 index 0000000..7f9708e --- /dev/null +++ b/test/test_cuda_u16_isqrt.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::isqrt(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::isqrt(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_next_multiple_of.cu b/test/test_cuda_u16_next_multiple_of.cu new file mode 100644 index 0000000..f597589 --- /dev/null +++ b/test/test_cuda_u16_next_multiple_of.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::next_multiple_of(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)()) / 2U}; + std::uniform_int_distribution dist2{1U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{static_cast(dist(rng))}; + input_vector2[i] = test_type{static_cast(dist2(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::next_multiple_of(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_remove_trailing_zeros.cu b/test/test_cuda_u16_remove_trailing_zeros.cu new file mode 100644 index 0000000..85559e2 --- /dev/null +++ b/test/test_cuda_u16_remove_trailing_zeros.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = test_type{boost::safe_numbers::remove_trailing_zeros(in[i]).trimmed_number}; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{1U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(test_type{boost::safe_numbers::remove_trailing_zeros(input_vector[i]).trimmed_number}); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_abs_diff.cu b/test/test_cuda_u32_abs_diff.cu new file mode 100644 index 0000000..213bfe8 --- /dev/null +++ b/test/test_cuda_u32_abs_diff.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::abs_diff(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + std::uniform_int_distribution dist2{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::abs_diff(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_div_ceil.cu b/test/test_cuda_u32_div_ceil.cu new file mode 100644 index 0000000..83def48 --- /dev/null +++ b/test/test_cuda_u32_div_ceil.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::div_ceil(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + std::uniform_int_distribution dist2{basis_type{1}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::div_ceil(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_ilog.cu b/test/test_cuda_u32_ilog.cu new file mode 100644 index 0000000..b98cd7b --- /dev/null +++ b/test/test_cuda_u32_ilog.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog(in[i], test_type{static_cast(7)}); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{1}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog(input_vector[i], test_type{static_cast(7)})); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_ilog10.cu b/test/test_cuda_u32_ilog10.cu new file mode 100644 index 0000000..9302d56 --- /dev/null +++ b/test/test_cuda_u32_ilog10.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog10(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{1}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog10(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_ilog2.cu b/test/test_cuda_u32_ilog2.cu new file mode 100644 index 0000000..85b2e9d --- /dev/null +++ b/test/test_cuda_u32_ilog2.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog2(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{1}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog2(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_ipow.cu b/test/test_cuda_u32_ipow.cu new file mode 100644 index 0000000..73a623a --- /dev/null +++ b/test/test_cuda_u32_ipow.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ipow(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + std::uniform_int_distribution dist2{basis_type{0}, basis_type{7}}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ipow(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_is_power_10.cu b/test/test_cuda_u32_is_power_10.cu new file mode 100644 index 0000000..3d8b03c --- /dev/null +++ b/test/test_cuda_u32_is_power_10.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::is_power_10(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{1}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::is_power_10(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_is_power_2.cu b/test/test_cuda_u32_is_power_2.cu new file mode 100644 index 0000000..381e674 --- /dev/null +++ b/test/test_cuda_u32_is_power_2.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::is_power_2(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::is_power_2(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_isqrt.cu b/test/test_cuda_u32_isqrt.cu new file mode 100644 index 0000000..a6fcb8c --- /dev/null +++ b/test/test_cuda_u32_isqrt.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::isqrt(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::isqrt(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_next_multiple_of.cu b/test/test_cuda_u32_next_multiple_of.cu new file mode 100644 index 0000000..3371948 --- /dev/null +++ b/test/test_cuda_u32_next_multiple_of.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::next_multiple_of(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)() / basis_type{2}}; + std::uniform_int_distribution dist2{basis_type{1}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::next_multiple_of(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_remove_trailing_zeros.cu b/test/test_cuda_u32_remove_trailing_zeros.cu new file mode 100644 index 0000000..1e8c9e4 --- /dev/null +++ b/test/test_cuda_u32_remove_trailing_zeros.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = test_type{boost::safe_numbers::remove_trailing_zeros(in[i]).trimmed_number}; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{1}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(test_type{boost::safe_numbers::remove_trailing_zeros(input_vector[i]).trimmed_number}); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_abs_diff.cu b/test/test_cuda_u64_abs_diff.cu new file mode 100644 index 0000000..038f420 --- /dev/null +++ b/test/test_cuda_u64_abs_diff.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::abs_diff(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + std::uniform_int_distribution dist2{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::abs_diff(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_div_ceil.cu b/test/test_cuda_u64_div_ceil.cu new file mode 100644 index 0000000..4bd8b1b --- /dev/null +++ b/test/test_cuda_u64_div_ceil.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::div_ceil(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + std::uniform_int_distribution dist2{basis_type{1}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::div_ceil(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_ilog.cu b/test/test_cuda_u64_ilog.cu new file mode 100644 index 0000000..430ae89 --- /dev/null +++ b/test/test_cuda_u64_ilog.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog(in[i], test_type{static_cast(7)}); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{1}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog(input_vector[i], test_type{static_cast(7)})); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_ilog10.cu b/test/test_cuda_u64_ilog10.cu new file mode 100644 index 0000000..2c67863 --- /dev/null +++ b/test/test_cuda_u64_ilog10.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog10(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{1}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog10(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_ilog2.cu b/test/test_cuda_u64_ilog2.cu new file mode 100644 index 0000000..375c119 --- /dev/null +++ b/test/test_cuda_u64_ilog2.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog2(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{1}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog2(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_ipow.cu b/test/test_cuda_u64_ipow.cu new file mode 100644 index 0000000..a2c39e1 --- /dev/null +++ b/test/test_cuda_u64_ipow.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ipow(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + std::uniform_int_distribution dist2{basis_type{0}, basis_type{7}}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ipow(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_is_power_10.cu b/test/test_cuda_u64_is_power_10.cu new file mode 100644 index 0000000..0f2fb55 --- /dev/null +++ b/test/test_cuda_u64_is_power_10.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::is_power_10(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{1}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::is_power_10(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_is_power_2.cu b/test/test_cuda_u64_is_power_2.cu new file mode 100644 index 0000000..c823c6f --- /dev/null +++ b/test/test_cuda_u64_is_power_2.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::is_power_2(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::is_power_2(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_isqrt.cu b/test/test_cuda_u64_isqrt.cu new file mode 100644 index 0000000..ba5a5ae --- /dev/null +++ b/test/test_cuda_u64_isqrt.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::isqrt(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::isqrt(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_next_multiple_of.cu b/test/test_cuda_u64_next_multiple_of.cu new file mode 100644 index 0000000..fd6f1a7 --- /dev/null +++ b/test/test_cuda_u64_next_multiple_of.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::next_multiple_of(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)() / basis_type{2}}; + std::uniform_int_distribution dist2{basis_type{1}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::next_multiple_of(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_remove_trailing_zeros.cu b/test/test_cuda_u64_remove_trailing_zeros.cu new file mode 100644 index 0000000..0a58876 --- /dev/null +++ b/test/test_cuda_u64_remove_trailing_zeros.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = test_type{boost::safe_numbers::remove_trailing_zeros(in[i]).trimmed_number}; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{1}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{dist(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(test_type{boost::safe_numbers::remove_trailing_zeros(input_vector[i]).trimmed_number}); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_abs_diff.cu b/test/test_cuda_u8_abs_diff.cu new file mode 100644 index 0000000..3d0e736 --- /dev/null +++ b/test/test_cuda_u8_abs_diff.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::abs_diff(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + std::uniform_int_distribution dist2{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{static_cast(dist(rng))}; + input_vector2[i] = test_type{static_cast(dist2(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::abs_diff(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_div_ceil.cu b/test/test_cuda_u8_div_ceil.cu new file mode 100644 index 0000000..9a4d4f2 --- /dev/null +++ b/test/test_cuda_u8_div_ceil.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::div_ceil(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + std::uniform_int_distribution dist2{1U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{static_cast(dist(rng))}; + input_vector2[i] = test_type{static_cast(dist2(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::div_ceil(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_ilog.cu b/test/test_cuda_u8_ilog.cu new file mode 100644 index 0000000..8fcdce1 --- /dev/null +++ b/test/test_cuda_u8_ilog.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog(in[i], test_type{static_cast(7)}); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{1U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog(input_vector[i], test_type{static_cast(7)})); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_ilog10.cu b/test/test_cuda_u8_ilog10.cu new file mode 100644 index 0000000..3bde939 --- /dev/null +++ b/test/test_cuda_u8_ilog10.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog10(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{1U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog10(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_ilog2.cu b/test/test_cuda_u8_ilog2.cu new file mode 100644 index 0000000..adf4094 --- /dev/null +++ b/test/test_cuda_u8_ilog2.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ilog2(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{1U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ilog2(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_ipow.cu b/test/test_cuda_u8_ipow.cu new file mode 100644 index 0000000..22232da --- /dev/null +++ b/test/test_cuda_u8_ipow.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::ipow(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + std::uniform_int_distribution dist2{0U, 7U}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{static_cast(dist(rng))}; + input_vector2[i] = test_type{static_cast(dist2(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::ipow(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_is_power_10.cu b/test/test_cuda_u8_is_power_10.cu new file mode 100644 index 0000000..7cee6aa --- /dev/null +++ b/test/test_cuda_u8_is_power_10.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::is_power_10(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{1U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::is_power_10(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_is_power_2.cu b/test/test_cuda_u8_is_power_2.cu new file mode 100644 index 0000000..4405f2d --- /dev/null +++ b/test/test_cuda_u8_is_power_2.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, int *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = static_cast(boost::safe_numbers::is_power_2(in[i])); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(static_cast(boost::safe_numbers::is_power_2(input_vector[i]))); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_isqrt.cu b/test/test_cuda_u8_isqrt.cu new file mode 100644 index 0000000..3c68a88 --- /dev/null +++ b/test/test_cuda_u8_isqrt.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::isqrt(in[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::isqrt(input_vector[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_next_multiple_of.cu b/test/test_cuda_u8_next_multiple_of.cu new file mode 100644 index 0000000..d5a95d3 --- /dev/null +++ b/test/test_cuda_u8_next_multiple_of.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::next_multiple_of(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)()) / 2U}; + std::uniform_int_distribution dist2{1U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{static_cast(dist(rng))}; + input_vector2[i] = test_type{static_cast(dist2(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::next_multiple_of(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_remove_trailing_zeros.cu b/test/test_cuda_u8_remove_trailing_zeros.cu new file mode 100644 index 0000000..f115383 --- /dev/null +++ b/test/test_cuda_u8_remove_trailing_zeros.cu @@ -0,0 +1,82 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = test_type{boost::safe_numbers::remove_trailing_zeros(in[i]).trimmed_number}; + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{1U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector[i] = test_type{static_cast(dist(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(test_type{boost::safe_numbers::remove_trailing_zeros(input_vector[i]).trimmed_number}); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 9ad5573d0c1a40a01853eec33ea037c0bccb90bd Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 10:57:16 -0400 Subject: [PATCH 11/36] Fix overflow in tests --- test/test_cuda_u128_ipow.cu | 4 ++-- test/test_cuda_u16_ipow.cu | 4 ++-- test/test_cuda_u32_ipow.cu | 4 ++-- test/test_cuda_u64_ipow.cu | 4 ++-- test/test_cuda_u8_ipow.cu | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/test_cuda_u128_ipow.cu b/test/test_cuda_u128_ipow.cu index 14fe473..4caada7 100644 --- a/test/test_cuda_u128_ipow.cu +++ b/test/test_cuda_u128_ipow.cu @@ -45,8 +45,8 @@ int main(void) cuda_managed_ptr input_vector2(numElements); cuda_managed_ptr output_vector(numElements); - boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; - boost::random::uniform_int_distribution dist2{basis_type{0U}, basis_type{7U}}; + boost::random::uniform_int_distribution dist{basis_type{0U}, basis_type{10U}}; + boost::random::uniform_int_distribution dist2{basis_type{0U}, basis_type{2U}}; for (int i = 0; i < numElements; ++i) { input_vector1[i] = test_type{dist(rng)}; diff --git a/test/test_cuda_u16_ipow.cu b/test/test_cuda_u16_ipow.cu index a7c8363..a45070a 100644 --- a/test/test_cuda_u16_ipow.cu +++ b/test/test_cuda_u16_ipow.cu @@ -40,8 +40,8 @@ int main(void) cuda_managed_ptr input_vector2(numElements); cuda_managed_ptr output_vector(numElements); - std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; - std::uniform_int_distribution dist2{0U, 7U}; + std::uniform_int_distribution dist{0U, 10U}; + std::uniform_int_distribution dist2{0U, 2U}; for (int i = 0; i < numElements; ++i) { input_vector1[i] = test_type{static_cast(dist(rng))}; diff --git a/test/test_cuda_u32_ipow.cu b/test/test_cuda_u32_ipow.cu index 73a623a..e8c1f1d 100644 --- a/test/test_cuda_u32_ipow.cu +++ b/test/test_cuda_u32_ipow.cu @@ -40,8 +40,8 @@ int main(void) cuda_managed_ptr input_vector2(numElements); cuda_managed_ptr output_vector(numElements); - std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; - std::uniform_int_distribution dist2{basis_type{0}, basis_type{7}}; + std::uniform_int_distribution dist{basis_type{0}, basis_type{10}}; + std::uniform_int_distribution dist2{basis_type{0}, basis_type{2}}; for (int i = 0; i < numElements; ++i) { input_vector1[i] = test_type{dist(rng)}; diff --git a/test/test_cuda_u64_ipow.cu b/test/test_cuda_u64_ipow.cu index a2c39e1..be09471 100644 --- a/test/test_cuda_u64_ipow.cu +++ b/test/test_cuda_u64_ipow.cu @@ -40,8 +40,8 @@ int main(void) cuda_managed_ptr input_vector2(numElements); cuda_managed_ptr output_vector(numElements); - std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; - std::uniform_int_distribution dist2{basis_type{0}, basis_type{7}}; + std::uniform_int_distribution dist{basis_type{0}, basis_type{10}}; + std::uniform_int_distribution dist2{basis_type{0}, basis_type{2}}; for (int i = 0; i < numElements; ++i) { input_vector1[i] = test_type{dist(rng)}; diff --git a/test/test_cuda_u8_ipow.cu b/test/test_cuda_u8_ipow.cu index 22232da..ca643a8 100644 --- a/test/test_cuda_u8_ipow.cu +++ b/test/test_cuda_u8_ipow.cu @@ -40,8 +40,8 @@ int main(void) cuda_managed_ptr input_vector2(numElements); cuda_managed_ptr output_vector(numElements); - std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; - std::uniform_int_distribution dist2{0U, 7U}; + std::uniform_int_distribution dist{0U, 10U}; + std::uniform_int_distribution dist2{0U, 2U}; for (int i = 0; i < numElements; ++i) { input_vector1[i] = test_type{static_cast(dist(rng))}; From 71e2f37280510b6e89e61e07da2f1b6d09597231 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 10:57:35 -0400 Subject: [PATCH 12/36] On CUDA move tables into functions --- .../boost/safe_numbers/detail/num_digits.hpp | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/boost/safe_numbers/detail/num_digits.hpp b/include/boost/safe_numbers/detail/num_digits.hpp index c35831d..70cf078 100644 --- a/include/boost/safe_numbers/detail/num_digits.hpp +++ b/include/boost/safe_numbers/detail/num_digits.hpp @@ -41,10 +41,14 @@ consteval auto make_powers_of_10() noexcept return table; } +#if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + inline constexpr auto powers_of_10_u32 {make_powers_of_10()}; inline constexpr auto powers_of_10_u64 {make_powers_of_10()}; inline constexpr auto powers_of_10_u128 {make_powers_of_10()}; +#endif + // ============================================================================ // num_digits: counts the number of decimal digits using MSB approximation // @@ -58,6 +62,12 @@ template requires (std::numeric_limits::digits <= 32 && std::is_unsigned_v) constexpr auto num_digits(const T init_x) noexcept -> int { + #if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + constexpr auto powers_of_10_u32 {make_powers_of_10()}; + + #endif + const auto x {static_cast(init_x)}; if (x == 0) @@ -86,6 +96,12 @@ constexpr auto num_digits(const T init_x) noexcept -> int // Overload for uint64_t constexpr auto num_digits(const std::uint64_t x) noexcept -> int { + #if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + constexpr auto powers_of_10_u64 {make_powers_of_10()}; + + #endif + if (x <= UINT32_MAX) { return num_digits(static_cast(x)); @@ -112,6 +128,12 @@ constexpr auto num_digits(const std::uint64_t x) noexcept -> int // Overload for uint128_t constexpr auto num_digits(const boost::int128::uint128_t& x) noexcept -> int { + #if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + + constexpr auto powers_of_10_u128 {make_powers_of_10()}; + + #endif + if (x.high == UINT64_C(0)) { return num_digits(x.low); From 0b8d781285f374ec5b6cfa364c1f164c2756e29a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 11:03:34 -0400 Subject: [PATCH 13/36] Use CUDA bit functions when on device --- .../boost/safe_numbers/detail/num_digits.hpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/include/boost/safe_numbers/detail/num_digits.hpp b/include/boost/safe_numbers/detail/num_digits.hpp index 70cf078..6c6caae 100644 --- a/include/boost/safe_numbers/detail/num_digits.hpp +++ b/include/boost/safe_numbers/detail/num_digits.hpp @@ -13,11 +13,20 @@ #ifndef BOOST_SAFE_NUMBERS_BUILD_MODULE -#include #include #include #include +#if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + +#include + +#else + +#include + +#endif + #endif namespace boost::safe_numbers::detail { @@ -75,7 +84,11 @@ constexpr auto num_digits(const T init_x) noexcept -> int return 1; } + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) const auto msb {32 - boost::core::countl_zero(x)}; + #else + const auto msb {32 - cuda::std::countl_zero(x)}; + #endif // Approximate log10 const auto estimated_digits {(msb * 1000) / 3322 + 1}; @@ -107,7 +120,11 @@ constexpr auto num_digits(const std::uint64_t x) noexcept -> int return num_digits(static_cast(x)); } + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) const auto msb {64 - boost::core::countl_zero(x)}; + #else + const auto msb {64 - cuda::std::countl_zero(x)}; + #endif // Approximate log10 const auto estimated_digits {(msb * 1000) / 3322 + 1}; From b09c97dcd9ab1bc74b7aa90a39bb468e1bd93ab7 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 11:11:30 -0400 Subject: [PATCH 14/36] Add CUDA support for numeric functions --- include/boost/safe_numbers/numeric.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/boost/safe_numbers/numeric.hpp b/include/boost/safe_numbers/numeric.hpp index dd5e1a0..73b5217 100644 --- a/include/boost/safe_numbers/numeric.hpp +++ b/include/boost/safe_numbers/numeric.hpp @@ -17,7 +17,7 @@ namespace boost::safe_numbers { template -[[nodiscard]] constexpr auto gcd(const T m, const T n) noexcept -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto gcd(const T m, const T n) noexcept -> T { using underlying_type = detail::underlying_type_t; @@ -32,7 +32,7 @@ template } template -[[nodiscard]] constexpr auto lcm(const T m, const T n) noexcept -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto lcm(const T m, const T n) noexcept -> T { using underlying_type = detail::underlying_type_t; @@ -47,7 +47,7 @@ template } template -[[nodiscard]] constexpr auto midpoint(const T a, const T b) noexcept -> T +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto midpoint(const T a, const T b) noexcept -> T { using underlying_type = detail::underlying_type_t; From e13f8d345694a1c24c96f935d80c22ea0489074c Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 11:26:27 -0400 Subject: [PATCH 15/36] Add CUDA testing of numeric functions --- test/cuda_jamfile | 27 ++++++++++ test/test_cuda_u128_gcd.cu | 90 +++++++++++++++++++++++++++++++++ test/test_cuda_u128_lcm.cu | 90 +++++++++++++++++++++++++++++++++ test/test_cuda_u128_midpoint.cu | 90 +++++++++++++++++++++++++++++++++ test/test_cuda_u16_gcd.cu | 85 +++++++++++++++++++++++++++++++ test/test_cuda_u16_lcm.cu | 85 +++++++++++++++++++++++++++++++ test/test_cuda_u16_midpoint.cu | 85 +++++++++++++++++++++++++++++++ test/test_cuda_u32_gcd.cu | 85 +++++++++++++++++++++++++++++++ test/test_cuda_u32_lcm.cu | 85 +++++++++++++++++++++++++++++++ test/test_cuda_u32_midpoint.cu | 85 +++++++++++++++++++++++++++++++ test/test_cuda_u64_gcd.cu | 85 +++++++++++++++++++++++++++++++ test/test_cuda_u64_lcm.cu | 85 +++++++++++++++++++++++++++++++ test/test_cuda_u64_midpoint.cu | 85 +++++++++++++++++++++++++++++++ test/test_cuda_u8_gcd.cu | 85 +++++++++++++++++++++++++++++++ test/test_cuda_u8_lcm.cu | 85 +++++++++++++++++++++++++++++++ test/test_cuda_u8_midpoint.cu | 85 +++++++++++++++++++++++++++++++ 16 files changed, 1317 insertions(+) create mode 100644 test/test_cuda_u128_gcd.cu create mode 100644 test/test_cuda_u128_lcm.cu create mode 100644 test/test_cuda_u128_midpoint.cu create mode 100644 test/test_cuda_u16_gcd.cu create mode 100644 test/test_cuda_u16_lcm.cu create mode 100644 test/test_cuda_u16_midpoint.cu create mode 100644 test/test_cuda_u32_gcd.cu create mode 100644 test/test_cuda_u32_lcm.cu create mode 100644 test/test_cuda_u32_midpoint.cu create mode 100644 test/test_cuda_u64_gcd.cu create mode 100644 test/test_cuda_u64_lcm.cu create mode 100644 test/test_cuda_u64_midpoint.cu create mode 100644 test/test_cuda_u8_gcd.cu create mode 100644 test/test_cuda_u8_lcm.cu create mode 100644 test/test_cuda_u8_midpoint.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 4b30acc..77be636 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -278,3 +278,30 @@ run test_cuda_u128_ipow.cu ; run test_cuda_u128_abs_diff.cu ; run test_cuda_u128_div_ceil.cu ; run test_cuda_u128_next_multiple_of.cu ; + +# Numeric tests + +# u8 numeric tests +run test_cuda_u8_gcd.cu ; +run test_cuda_u8_lcm.cu ; +run test_cuda_u8_midpoint.cu ; + +# u16 numeric tests +run test_cuda_u16_gcd.cu ; +run test_cuda_u16_lcm.cu ; +run test_cuda_u16_midpoint.cu ; + +# u32 numeric tests +run test_cuda_u32_gcd.cu ; +run test_cuda_u32_lcm.cu ; +run test_cuda_u32_midpoint.cu ; + +# u64 numeric tests +run test_cuda_u64_gcd.cu ; +run test_cuda_u64_lcm.cu ; +run test_cuda_u64_midpoint.cu ; + +# u128 numeric tests +run test_cuda_u128_gcd.cu ; +run test_cuda_u128_lcm.cu ; +run test_cuda_u128_midpoint.cu ; diff --git a/test/test_cuda_u128_gcd.cu b/test/test_cuda_u128_gcd.cu new file mode 100644 index 0000000..d39fb2d --- /dev/null +++ b/test/test_cuda_u128_gcd.cu @@ -0,0 +1,90 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::gcd(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + boost::random::uniform_int_distribution dist2{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::gcd(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_lcm.cu b/test/test_cuda_u128_lcm.cu new file mode 100644 index 0000000..ba04d7e --- /dev/null +++ b/test/test_cuda_u128_lcm.cu @@ -0,0 +1,90 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::lcm(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{1U}, basis_type{10U}}; + boost::random::uniform_int_distribution dist2{basis_type{1U}, basis_type{10U}}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::lcm(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u128_midpoint.cu b/test/test_cuda_u128_midpoint.cu new file mode 100644 index 0000000..00ea8d2 --- /dev/null +++ b/test/test_cuda_u128_midpoint.cu @@ -0,0 +1,90 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_SAFE_NUMBERS_DETAIL_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u128; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::midpoint(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist{basis_type{0U}, (std::numeric_limits::max)()}; + boost::random::uniform_int_distribution dist2{basis_type{0U}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::midpoint(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_gcd.cu b/test/test_cuda_u16_gcd.cu new file mode 100644 index 0000000..0bff419 --- /dev/null +++ b/test/test_cuda_u16_gcd.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::gcd(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + std::uniform_int_distribution dist2{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{static_cast(dist(rng))}; + input_vector2[i] = test_type{static_cast(dist2(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::gcd(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_lcm.cu b/test/test_cuda_u16_lcm.cu new file mode 100644 index 0000000..2f2f420 --- /dev/null +++ b/test/test_cuda_u16_lcm.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::lcm(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{1U, 10U}; + std::uniform_int_distribution dist2{1U, 10U}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{static_cast(dist(rng))}; + input_vector2[i] = test_type{static_cast(dist2(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::lcm(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u16_midpoint.cu b/test/test_cuda_u16_midpoint.cu new file mode 100644 index 0000000..136056b --- /dev/null +++ b/test/test_cuda_u16_midpoint.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u16; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::midpoint(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + std::uniform_int_distribution dist2{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{static_cast(dist(rng))}; + input_vector2[i] = test_type{static_cast(dist2(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::midpoint(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_gcd.cu b/test/test_cuda_u32_gcd.cu new file mode 100644 index 0000000..0ca6178 --- /dev/null +++ b/test/test_cuda_u32_gcd.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::gcd(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + std::uniform_int_distribution dist2{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::gcd(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_lcm.cu b/test/test_cuda_u32_lcm.cu new file mode 100644 index 0000000..09019b9 --- /dev/null +++ b/test/test_cuda_u32_lcm.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::lcm(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{1}, basis_type{10}}; + std::uniform_int_distribution dist2{basis_type{1}, basis_type{10}}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::lcm(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u32_midpoint.cu b/test/test_cuda_u32_midpoint.cu new file mode 100644 index 0000000..aadfcac --- /dev/null +++ b/test/test_cuda_u32_midpoint.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::midpoint(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + std::uniform_int_distribution dist2{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::midpoint(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_gcd.cu b/test/test_cuda_u64_gcd.cu new file mode 100644 index 0000000..6d0dc83 --- /dev/null +++ b/test/test_cuda_u64_gcd.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::gcd(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + std::uniform_int_distribution dist2{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::gcd(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_lcm.cu b/test/test_cuda_u64_lcm.cu new file mode 100644 index 0000000..24dce83 --- /dev/null +++ b/test/test_cuda_u64_lcm.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::lcm(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{1}, basis_type{10}}; + std::uniform_int_distribution dist2{basis_type{1}, basis_type{10}}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::lcm(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u64_midpoint.cu b/test/test_cuda_u64_midpoint.cu new file mode 100644 index 0000000..bba9451 --- /dev/null +++ b/test/test_cuda_u64_midpoint.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u64; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::midpoint(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{basis_type{0}, (std::numeric_limits::max)()}; + std::uniform_int_distribution dist2{basis_type{0}, (std::numeric_limits::max)()}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{dist(rng)}; + input_vector2[i] = test_type{dist2(rng)}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::midpoint(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_gcd.cu b/test/test_cuda_u8_gcd.cu new file mode 100644 index 0000000..3ab02ee --- /dev/null +++ b/test/test_cuda_u8_gcd.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::gcd(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + std::uniform_int_distribution dist2{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{static_cast(dist(rng))}; + input_vector2[i] = test_type{static_cast(dist2(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::gcd(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_lcm.cu b/test/test_cuda_u8_lcm.cu new file mode 100644 index 0000000..636263b --- /dev/null +++ b/test/test_cuda_u8_lcm.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::lcm(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{1U, 10U}; + std::uniform_int_distribution dist2{1U, 10U}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{static_cast(dist(rng))}; + input_vector2[i] = test_type{static_cast(dist2(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::lcm(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_cuda_u8_midpoint.cu b/test/test_cuda_u8_midpoint.cu new file mode 100644 index 0000000..37e2a76 --- /dev/null +++ b/test/test_cuda_u8_midpoint.cu @@ -0,0 +1,85 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::safe_numbers::u8; +using basis_type = test_type::basis_type; + +__global__ void cuda_test(const test_type *in1, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::safe_numbers::midpoint(in1[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng{42}; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector1(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + std::uniform_int_distribution dist{0U, static_cast((std::numeric_limits::max)())}; + std::uniform_int_distribution dist2{0U, static_cast((std::numeric_limits::max)())}; + for (int i = 0; i < numElements; ++i) + { + input_vector1[i] = test_type{static_cast(dist(rng))}; + input_vector2[i] = test_type{static_cast(dist2(rng))}; + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + boost::safe_numbers::device_error_context ctx; + watch w; + + cuda_test<<>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements); + ctx.synchronize(); + + std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl; + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::safe_numbers::midpoint(input_vector1[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From f15b9f1fc9e745c51c832e9a345c6f66c990f9ae Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 11:31:29 -0400 Subject: [PATCH 16/36] Use cuda::std:: when compiling on device --- include/boost/safe_numbers/numeric.hpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/include/boost/safe_numbers/numeric.hpp b/include/boost/safe_numbers/numeric.hpp index 73b5217..cfb95d9 100644 --- a/include/boost/safe_numbers/numeric.hpp +++ b/include/boost/safe_numbers/numeric.hpp @@ -10,7 +10,11 @@ #ifndef BOOST_SAFE_NUMBERS_BUILD_MODULE +#if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) +#include +#else #include +#endif #endif @@ -27,7 +31,11 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto gcd(const T m, const } else { + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) return T{static_cast(std::gcd(static_cast(m), static_cast(n)))}; + #else + return T{static_cast(cuda::std::gcd(static_cast(m), static_cast(n)))}; + #endif } } @@ -42,7 +50,11 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto lcm(const T m, const } else { + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) return T{static_cast(std::lcm(static_cast(m), static_cast(n)))}; + #else + return T{static_cast(cuda::std::lcm(static_cast(m), static_cast(n)))}; + #endif } } @@ -57,7 +69,11 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto midpoint(const T a, } else { + #if !(defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) return T{static_cast(std::midpoint(static_cast(a), static_cast(b)))}; + #else + return T{static_cast(cuda::std::midpoint(static_cast(a), static_cast(b)))}; + #endif } } From afa19caf68502a46b174b3718df1fcc171291f66 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 11:40:34 -0400 Subject: [PATCH 17/36] Add example of CUDA usage --- examples/cuda.cu | 224 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100644 examples/cuda.cu diff --git a/examples/cuda.cu b/examples/cuda.cu new file mode 100644 index 0000000..eb0a63c --- /dev/null +++ b/examples/cuda.cu @@ -0,0 +1,224 @@ +// Copyright 2026 Matt Borland +// Distributed under the Boost Software License, Version 1.0. +// https://www.boost.org/LICENSE_1_0.txt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +// All safe_numbers types and free functions are annotated with __host__ __device__, +// so they work identically on both host and device. + +__global__ void arithmetic_kernel(const test_type* a, const test_type* b, test_type* out, int n) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) + { + // Basic arithmetic with overflow detection works on device + out[i] = a[i] + b[i]; + } +} + +__global__ void bit_kernel(const test_type* in, int* out, int n) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) + { + // All free functions work on device + out[i] = boost::safe_numbers::popcount(in[i]); + } +} + +__global__ void utility_kernel(const test_type* in, test_type* out, int n) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) + { + // Integer utilities work on device + out[i] = boost::safe_numbers::isqrt(in[i]); + } +} + +__global__ void numeric_kernel(const test_type* a, const test_type* b, test_type* out, int n) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) + { + // gcd, lcm, midpoint work on device + out[i] = boost::safe_numbers::gcd(a[i], b[i]); + } +} + +__global__ void charconv_kernel(const test_type* in, test_type* out, int n) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) + { + // charconv round-trip on device + char buf[16] {}; + auto tc = boost::charconv::to_chars(buf, buf + sizeof(buf), in[i]); + test_type parsed {}; + boost::charconv::from_chars(buf, tc.ptr, parsed); + out[i] = parsed; + } +} + +// Helper: allocate CUDA managed memory +void allocate(void** ptr, std::size_t bytes) +{ + cudaError_t err = cudaMallocManaged(ptr, bytes); + if (err != cudaSuccess) + { + throw std::runtime_error(cudaGetErrorString(err)); + } + cudaDeviceSynchronize(); +} + +template +void cleanup(T** ptr) +{ + if (*ptr != nullptr) + { + cudaFree(*ptr); + *ptr = nullptr; + } +} + +int main() +{ + const int n = 10000; + const int threadsPerBlock = 256; + const int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; + + std::mt19937_64 rng {42}; + std::uniform_int_distribution dist {basis_type{1}, (std::numeric_limits::max)() / basis_type{2}}; + + // --- Allocate managed arrays --- + + test_type* a = nullptr; + test_type* b = nullptr; + test_type* out_tt = nullptr; + int* out_int = nullptr; + + allocate(reinterpret_cast(&a), n * sizeof(test_type)); + allocate(reinterpret_cast(&b), n * sizeof(test_type)); + allocate(reinterpret_cast(&out_tt), n * sizeof(test_type)); + allocate(reinterpret_cast(&out_int), n * sizeof(int)); + + for (int i = 0; i < n; ++i) + { + a[i] = test_type{dist(rng)}; + b[i] = test_type{dist(rng)}; + } + + // The device_error_context captures any overflow/underflow errors + // reported from device code and rethrows them on the host. + boost::safe_numbers::device_error_context ctx; + + // --- Test 1: Arithmetic (a + b, using half-range to avoid overflow) --- + + arithmetic_kernel<<>>(a, b, out_tt, n); + ctx.synchronize(); + + bool pass = true; + for (int i = 0; i < n; ++i) + { + if (out_tt[i] != a[i] + b[i]) + { + pass = false; + break; + } + } + std::cout << "Arithmetic (add): " << (pass ? "PASSED" : "FAILED") << '\n'; + + // --- Test 2: Bit functions (popcount) --- + + bit_kernel<<>>(a, out_int, n); + ctx.synchronize(); + + pass = true; + for (int i = 0; i < n; ++i) + { + if (out_int[i] != boost::safe_numbers::popcount(a[i])) + { + pass = false; + break; + } + } + std::cout << "Bit (popcount): " << (pass ? "PASSED" : "FAILED") << '\n'; + + // --- Test 3: Integer utilities (isqrt) --- + + utility_kernel<<>>(a, out_tt, n); + ctx.synchronize(); + + pass = true; + for (int i = 0; i < n; ++i) + { + if (out_tt[i] != boost::safe_numbers::isqrt(a[i])) + { + pass = false; + break; + } + } + std::cout << "Utility (isqrt): " << (pass ? "PASSED" : "FAILED") << '\n'; + + // --- Test 4: Numeric (gcd) --- + + numeric_kernel<<>>(a, b, out_tt, n); + ctx.synchronize(); + + pass = true; + for (int i = 0; i < n; ++i) + { + if (out_tt[i] != boost::safe_numbers::gcd(a[i], b[i])) + { + pass = false; + break; + } + } + std::cout << "Numeric (gcd): " << (pass ? "PASSED" : "FAILED") << '\n'; + + // --- Test 5: Charconv round-trip --- + + charconv_kernel<<>>(a, out_tt, n); + ctx.synchronize(); + + pass = true; + for (int i = 0; i < n; ++i) + { + if (out_tt[i] != a[i]) + { + pass = false; + break; + } + } + std::cout << "Charconv (rt): " << (pass ? "PASSED" : "FAILED") << '\n'; + + // --- Cleanup --- + + cleanup(&a); + cleanup(&b); + cleanup(&out_tt); + cleanup(&out_int); + cudaDeviceReset(); + + return 0; +} From 456ba65292136ebb4d589c2309a17e74aa7b13c9 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 11:42:22 -0400 Subject: [PATCH 18/36] Add example of failure and how to use error context --- examples/cuda_error_handling.cu | 119 ++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 examples/cuda_error_handling.cu diff --git a/examples/cuda_error_handling.cu b/examples/cuda_error_handling.cu new file mode 100644 index 0000000..08f6eab --- /dev/null +++ b/examples/cuda_error_handling.cu @@ -0,0 +1,119 @@ +// Copyright 2026 Matt Borland +// Distributed under the Boost Software License, Version 1.0. +// https://www.boost.org/LICENSE_1_0.txt + +// This example demonstrates how to catch arithmetic errors that occur +// on a CUDA device using device_error_context. When a safe_numbers +// operation overflows on the GPU, the error is captured in managed +// memory and rethrown with BOOST_THROW_EXCEPTION on the host when +// you call ctx.synchronize(). + +#include +#include +#include +#include + +#include + +using test_type = boost::safe_numbers::u32; +using basis_type = test_type::basis_type; + +// This kernel deliberately overflows: it adds 1 to the maximum u32 value +__global__ void overflow_kernel(test_type* out) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i == 0) + { + const test_type max_val {(std::numeric_limits::max)()}; + out[0] = max_val + test_type{1}; // Overflow! + } +} + +// This kernel performs valid arithmetic +__global__ void safe_kernel(const test_type* in, test_type* out, int n) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) + { + out[i] = in[i] + test_type{1}; + } +} + +int main() +{ + test_type* data = nullptr; + test_type* result = nullptr; + + cudaMallocManaged(&data, 4 * sizeof(test_type)); + cudaMallocManaged(&result, 4 * sizeof(test_type)); + cudaDeviceSynchronize(); + + // --------------------------------------------------------------- + // Step 1: Demonstrate catching a device-side overflow + // --------------------------------------------------------------- + + // Create a device_error_context. The constructor clears any + // stale error state from previous kernel launches. + boost::safe_numbers::device_error_context ctx; + + std::cout << "=== Launching kernel that overflows ===" << std::endl; + + overflow_kernel<<<1, 1>>>(result); + + // synchronize() does three things: + // 1. Calls cudaDeviceSynchronize() to wait for the kernel + // 2. Reads the error state from managed memory + // 3. Throws the appropriate std::exception if an error was captured + try + { + ctx.synchronize(); + std::cout << "No error detected (unexpected)" << std::endl; + } + catch (const std::overflow_error& e) + { + std::cout << "Caught overflow_error: " << e.what() << std::endl; + } + + // --------------------------------------------------------------- + // Step 2: After catching the error, the context is automatically + // reset. You can reuse it for the next kernel launch. + // --------------------------------------------------------------- + + std::cout << "\n=== Launching kernel with valid arithmetic ===" << std::endl; + + data[0] = test_type{10}; + data[1] = test_type{20}; + data[2] = test_type{30}; + data[3] = test_type{40}; + + safe_kernel<<<1, 4>>>(data, result, 4); + + try + { + ctx.synchronize(); + std::cout << "No error detected (expected)" << std::endl; + } + catch (const std::exception& e) + { + std::cout << "Unexpected error: " << e.what() << std::endl; + } + + // Verify results + for (int i = 0; i < 4; ++i) + { + std::cout << "result[" << i << "] = " + << static_cast(result[i]) << std::endl; + } + + // --------------------------------------------------------------- + // Cleanup + // --------------------------------------------------------------- + + cudaFree(data); + cudaFree(result); + cudaDeviceReset(); + + return 0; +} From c9765322ecc6d34d6f32829a6a1d25a2bfd25d57 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 11:46:11 -0400 Subject: [PATCH 19/36] Add examples to CUDA Jamfile --- test/cuda_jamfile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/cuda_jamfile b/test/cuda_jamfile index 77be636..667f088 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -305,3 +305,7 @@ run test_cuda_u64_midpoint.cu ; run test_cuda_u128_gcd.cu ; run test_cuda_u128_lcm.cu ; run test_cuda_u128_midpoint.cu ; + +# Examples +run ../examples/cuda.cu ; +run ../examples/cuda_error_handling.cu ; From c9153759fa6cfa5180125f09632db76ab222eec4 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 12:00:27 -0400 Subject: [PATCH 20/36] Make the global a pointer because trap will destroy all managed memory --- .../detail/cuda_error_reporting.hpp | 141 +++++++++++++----- 1 file changed, 102 insertions(+), 39 deletions(-) diff --git a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp index c7e144e..596223e 100644 --- a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp +++ b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp @@ -87,9 +87,13 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE inline void copy_to_buf(char* dst, const char* sr #ifdef __CUDACC__ -// __managed__ places this in unified memory so the host can read it directly -// without cudaMemcpyFromSymbol, which fails after __trap() corrupts the device context -__managed__ cuda_device_error g_device_error = {0, 0, 0, exception_type::unknown, {'\0'}, {'\0'}}; +// __managed__ pointer to dynamically allocated managed memory. +// Using a pointer (rather than a __managed__ struct) lets us free and +// re-allocate after cudaDeviceReset(), which is required to recover +// from __trap() corrupting the device context. +// The pointer itself lives in managed memory so device code can +// dereference it directly. +__managed__ cuda_device_error* g_device_error = nullptr; __host__ __device__ inline void report_device_error( exception_type exc, @@ -99,14 +103,14 @@ __host__ __device__ inline void report_device_error( { #ifdef __CUDA_ARCH__ - if (atomicCAS(&g_device_error.flag, 0, 1) == 0) + if (g_device_error != nullptr && atomicCAS(&g_device_error->flag, 0, 1) == 0) { - g_device_error.line = line; - g_device_error.thread_id = blockIdx.x * blockDim.x + threadIdx.x; - g_device_error.exception = exc; + g_device_error->line = line; + g_device_error->thread_id = blockIdx.x * blockDim.x + threadIdx.x; + g_device_error->exception = exc; - copy_to_buf(g_device_error.file, file, BOOST_SAFE_NUMBERS_DEVICE_ERROR_BUFFER_SIZE); - copy_to_buf(g_device_error.expression, expression, BOOST_SAFE_NUMBERS_DEVICE_ERROR_BUFFER_SIZE); + copy_to_buf(g_device_error->file, file, BOOST_SAFE_NUMBERS_DEVICE_ERROR_BUFFER_SIZE); + copy_to_buf(g_device_error->expression, expression, BOOST_SAFE_NUMBERS_DEVICE_ERROR_BUFFER_SIZE); __threadfence_system(); printf("Device error on thread %d at %s:%d: %s\n", @@ -154,68 +158,111 @@ class device_error_context { public: - // Clears the global state - // The error context can be reused with multiple kernels if this is called - void reset() + // Allocates the managed error struct if it does not already exist, + // then clears the error state. After cudaDeviceReset() the __managed__ + // pointer is back to nullptr, so the next construction re-allocates. + device_error_context() { - detail::g_device_error.flag = 0; - detail::g_device_error.line = 0; - detail::g_device_error.thread_id = 0; - detail::g_device_error.exception = detail::exception_type::unknown; - detail::g_device_error.file[0] = '\0'; - detail::g_device_error.expression[0] = '\0'; + ensure_allocated(); + reset(); } - // On construction, reset the global error state to ensure we have a good start - device_error_context() + // Free the managed allocation during normal (non-error) shutdown. + // After cudaDeviceReset(), g_device_error is already nullptr. + ~device_error_context() { - reset(); + if (detail::g_device_error != nullptr) + { + cudaFree(detail::g_device_error); + detail::g_device_error = nullptr; + } + } + + device_error_context(const device_error_context&) = delete; + device_error_context& operator=(const device_error_context&) = delete; + + // Clears the error fields so the context can be reused across kernel launches + void reset() + { + if (detail::g_device_error == nullptr) + { + ensure_allocated(); + } + + detail::g_device_error->flag = 0; + detail::g_device_error->line = 0; + detail::g_device_error->thread_id = 0; + detail::g_device_error->exception = detail::exception_type::unknown; + detail::g_device_error->file[0] = '\0'; + detail::g_device_error->expression[0] = '\0'; } - // Allows the user to synchronize and check for errors as is typical of CUDA - // This allows an extra step in that it will throw on the host - // Much like cudaGetLastError, the call to synchronize will destroy the information in the global context - // This allows trivial reuse of all these facilities + // Allows the user to synchronize and check for errors as is typical of CUDA. + // This allows an extra step in that it will throw on the host. + // + // When a device error is detected (flag != 0): + // 1. The error info is copied to local variables + // 2. The managed allocation is freed and the device is reset + // (required because __trap() corrupts the device context) + // 3. The appropriate std::exception is thrown + // + // After catching the exception, a new device_error_context can be + // constructed which will re-allocate fresh managed memory. void synchronize() { const auto status = cudaDeviceSynchronize(); + if (detail::g_device_error == nullptr) + { + if (status != cudaSuccess) + { + cudaGetLastError(); + BOOST_THROW_EXCEPTION(std::runtime_error(cudaGetErrorString(status))); + } + return; + } + // Read directly from managed memory — no cudaMemcpyFromSymbol needed // This works even after __trap() corrupts the device context - const auto flag = detail::g_device_error.flag; - const auto thread_id = detail::g_device_error.thread_id; - const auto line = detail::g_device_error.line; + const auto flag = detail::g_device_error->flag; if (flag != 0) { + // Copy everything we need to local storage before freeing + const auto thread_id = detail::g_device_error->thread_id; + const auto line = detail::g_device_error->line; + const auto exc = detail::g_device_error->exception; + std::ostringstream oss; oss << "Device error on thread " << thread_id - << " at " << detail::g_device_error.file + << " at " << detail::g_device_error->file << ":" << line - << ": " << detail::g_device_error.expression; + << ": " << detail::g_device_error->expression; - // Read exception type before reset clears it - const auto exc = detail::g_device_error.exception; + const auto msg = oss.str(); - // Clear the sticky CUDA error and reset our state - cudaGetLastError(); - reset(); + // Free the managed allocation and reset the device so that + // new kernels can be launched after the user catches the exception. + // cudaDeviceReset() re-initializes the __managed__ pointer to nullptr. + cudaFree(detail::g_device_error); + detail::g_device_error = nullptr; + cudaDeviceReset(); switch (exc) { case detail::exception_type::domain_error: - BOOST_THROW_EXCEPTION(std::domain_error(oss.str())); + BOOST_THROW_EXCEPTION(std::domain_error(msg)); break; case detail::exception_type::overflow: - BOOST_THROW_EXCEPTION(std::overflow_error(oss.str())); + BOOST_THROW_EXCEPTION(std::overflow_error(msg)); break; case detail::exception_type::underflow: - BOOST_THROW_EXCEPTION(std::underflow_error(oss.str())); + BOOST_THROW_EXCEPTION(std::underflow_error(msg)); break; case detail::exception_type::unknown: [[fallthrough]]; default: - BOOST_THROW_EXCEPTION(std::runtime_error(oss.str())); + BOOST_THROW_EXCEPTION(std::runtime_error(msg)); } } @@ -227,6 +274,22 @@ class device_error_context BOOST_THROW_EXCEPTION(std::runtime_error(cudaGetErrorString(status))); } } + +private: + + void ensure_allocated() + { + if (detail::g_device_error == nullptr) + { + const auto err = cudaMallocManaged(&detail::g_device_error, sizeof(detail::cuda_device_error)); + if (err != cudaSuccess) + { + BOOST_THROW_EXCEPTION(std::runtime_error( + std::string("Failed to allocate device error context: ") + cudaGetErrorString(err))); + } + cudaDeviceSynchronize(); + } + } }; #endif // __CUDACC__ From 4504f3e4f9482f03dc9c5f8444f6ee4fc8ad7500 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 12:00:42 -0400 Subject: [PATCH 21/36] Realloc after thrown exception --- examples/cuda_error_handling.cu | 54 +++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/examples/cuda_error_handling.cu b/examples/cuda_error_handling.cu index 08f6eab..903f8b4 100644 --- a/examples/cuda_error_handling.cu +++ b/examples/cuda_error_handling.cu @@ -7,6 +7,13 @@ // operation overflows on the GPU, the error is captured in managed // memory and rethrown with BOOST_THROW_EXCEPTION on the host when // you call ctx.synchronize(). +// +// The device_error_context manages a dynamically allocated managed +// memory buffer. When an error is detected, synchronize() copies the +// error info to host locals, frees the managed buffer, resets the +// device, and throws. After catching the exception, the same context +// can be reused — the next call to synchronize() automatically +// re-allocates fresh managed memory via reset(). #include #include @@ -43,29 +50,27 @@ __global__ void safe_kernel(const test_type* in, test_type* out, int n) int main() { - test_type* data = nullptr; - test_type* result = nullptr; - - cudaMallocManaged(&data, 4 * sizeof(test_type)); - cudaMallocManaged(&result, 4 * sizeof(test_type)); - cudaDeviceSynchronize(); + // Create a single device_error_context for the lifetime of the program. + // The constructor allocates managed memory for error reporting and + // clears any stale state. + boost::safe_numbers::device_error_context ctx; // --------------------------------------------------------------- - // Step 1: Demonstrate catching a device-side overflow + // Step 1: Launch a kernel that overflows and catch the error // --------------------------------------------------------------- - // Create a device_error_context. The constructor clears any - // stale error state from previous kernel launches. - boost::safe_numbers::device_error_context ctx; + test_type* result = nullptr; + cudaMallocManaged(&result, sizeof(test_type)); + cudaDeviceSynchronize(); std::cout << "=== Launching kernel that overflows ===" << std::endl; overflow_kernel<<<1, 1>>>(result); - // synchronize() does three things: - // 1. Calls cudaDeviceSynchronize() to wait for the kernel - // 2. Reads the error state from managed memory - // 3. Throws the appropriate std::exception if an error was captured + // synchronize() waits for the kernel, reads the error state, + // and throws the appropriate std::exception if one was captured. + // On error it also calls cudaDeviceReset() internally, so the + // device is ready for fresh work after catching the exception. try { ctx.synchronize(); @@ -77,18 +82,28 @@ int main() } // --------------------------------------------------------------- - // Step 2: After catching the error, the context is automatically - // reset. You can reuse it for the next kernel launch. + // Step 2: After catching the error, the same ctx can be reused. + // The next synchronize() call automatically re-allocates + // managed memory for error reporting. + // Note: cudaDeviceReset() freed all prior allocations, + // so we must re-allocate our data buffers too. // --------------------------------------------------------------- std::cout << "\n=== Launching kernel with valid arithmetic ===" << std::endl; + test_type* data = nullptr; + test_type* out = nullptr; + + cudaMallocManaged(&data, 4 * sizeof(test_type)); + cudaMallocManaged(&out, 4 * sizeof(test_type)); + cudaDeviceSynchronize(); + data[0] = test_type{10}; data[1] = test_type{20}; data[2] = test_type{30}; data[3] = test_type{40}; - safe_kernel<<<1, 4>>>(data, result, 4); + safe_kernel<<<1, 4>>>(data, out, 4); try { @@ -100,11 +115,10 @@ int main() std::cout << "Unexpected error: " << e.what() << std::endl; } - // Verify results for (int i = 0; i < 4; ++i) { std::cout << "result[" << i << "] = " - << static_cast(result[i]) << std::endl; + << static_cast(out[i]) << std::endl; } // --------------------------------------------------------------- @@ -112,7 +126,7 @@ int main() // --------------------------------------------------------------- cudaFree(data); - cudaFree(result); + cudaFree(out); cudaDeviceReset(); return 0; From a4582c7d1303aa3cc0e73d6578e645c5c48b60ba Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 12:02:57 -0400 Subject: [PATCH 22/36] Enforce allowing only a single error context at any time to avoid race --- .../safe_numbers/detail/cuda_error_reporting.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp index 596223e..3a733d1 100644 --- a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp +++ b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp @@ -95,6 +95,10 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE inline void copy_to_buf(char* dst, const char* sr // dereference it directly. __managed__ cuda_device_error* g_device_error = nullptr; +// Tracks whether a device_error_context instance is alive. +// Only one may exist at a time to prevent races on g_device_error. +inline bool g_device_error_context_active = false; + __host__ __device__ inline void report_device_error( exception_type exc, const char* file, @@ -161,8 +165,15 @@ class device_error_context // Allocates the managed error struct if it does not already exist, // then clears the error state. After cudaDeviceReset() the __managed__ // pointer is back to nullptr, so the next construction re-allocates. + // Only one device_error_context may exist at a time. device_error_context() { + if (detail::g_device_error_context_active) + { + BOOST_THROW_EXCEPTION(std::logic_error( + "Only one device_error_context may exist at a time")); + } + detail::g_device_error_context_active = true; ensure_allocated(); reset(); } @@ -176,6 +187,7 @@ class device_error_context cudaFree(detail::g_device_error); detail::g_device_error = nullptr; } + detail::g_device_error_context_active = false; } device_error_context(const device_error_context&) = delete; From d507b6a48fc38f1c18d2e936b4044a257138aef8 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 12:39:49 -0400 Subject: [PATCH 23/36] Use local allocation instead of always global --- .../detail/cuda_error_reporting.hpp | 67 +++++++++++-------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp index 3a733d1..c970694 100644 --- a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp +++ b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp @@ -160,11 +160,17 @@ __host__ __device__ inline void report_device_error( class device_error_context { + // Host-side mirror of the managed allocation pointer. + // All host-side reads go through this so we never touch the + // __managed__ variable g_device_error after cudaDeviceReset(). + detail::cuda_device_error* m_allocation {nullptr}; + public: // Allocates the managed error struct if it does not already exist, // then clears the error state. After cudaDeviceReset() the __managed__ - // pointer is back to nullptr, so the next construction re-allocates. + // pointer is back to nullptr, so the next call to reset() or + // synchronize() re-allocates automatically. // Only one device_error_context may exist at a time. device_error_context() { @@ -179,13 +185,14 @@ class device_error_context } // Free the managed allocation during normal (non-error) shutdown. - // After cudaDeviceReset(), g_device_error is already nullptr. + // Uses the host-side m_allocation pointer so this is safe even + // after cudaDeviceReset() has invalidated the __managed__ global. ~device_error_context() { - if (detail::g_device_error != nullptr) + if (m_allocation != nullptr) { - cudaFree(detail::g_device_error); - detail::g_device_error = nullptr; + cudaFree(m_allocation); + m_allocation = nullptr; } detail::g_device_error_context_active = false; } @@ -193,20 +200,21 @@ class device_error_context device_error_context(const device_error_context&) = delete; device_error_context& operator=(const device_error_context&) = delete; - // Clears the error fields so the context can be reused across kernel launches + // Clears the error fields so the context can be reused across kernel launches. + // If the managed buffer was freed (e.g. after a device reset), re-allocates it. void reset() { - if (detail::g_device_error == nullptr) + if (m_allocation == nullptr) { ensure_allocated(); } - detail::g_device_error->flag = 0; - detail::g_device_error->line = 0; - detail::g_device_error->thread_id = 0; - detail::g_device_error->exception = detail::exception_type::unknown; - detail::g_device_error->file[0] = '\0'; - detail::g_device_error->expression[0] = '\0'; + m_allocation->flag = 0; + m_allocation->line = 0; + m_allocation->thread_id = 0; + m_allocation->exception = detail::exception_type::unknown; + m_allocation->file[0] = '\0'; + m_allocation->expression[0] = '\0'; } // Allows the user to synchronize and check for errors as is typical of CUDA. @@ -218,13 +226,13 @@ class device_error_context // (required because __trap() corrupts the device context) // 3. The appropriate std::exception is thrown // - // After catching the exception, a new device_error_context can be - // constructed which will re-allocate fresh managed memory. + // After catching the exception, the same context can be reused — + // the next call to synchronize() automatically re-allocates. void synchronize() { const auto status = cudaDeviceSynchronize(); - if (detail::g_device_error == nullptr) + if (m_allocation == nullptr) { if (status != cudaSuccess) { @@ -234,30 +242,31 @@ class device_error_context return; } - // Read directly from managed memory — no cudaMemcpyFromSymbol needed + // Read directly from managed memory via host-side pointer // This works even after __trap() corrupts the device context - const auto flag = detail::g_device_error->flag; + const auto flag = m_allocation->flag; if (flag != 0) { // Copy everything we need to local storage before freeing - const auto thread_id = detail::g_device_error->thread_id; - const auto line = detail::g_device_error->line; - const auto exc = detail::g_device_error->exception; + const auto thread_id = m_allocation->thread_id; + const auto line = m_allocation->line; + const auto exc = m_allocation->exception; std::ostringstream oss; oss << "Device error on thread " << thread_id - << " at " << detail::g_device_error->file + << " at " << m_allocation->file << ":" << line - << ": " << detail::g_device_error->expression; + << ": " << m_allocation->expression; const auto msg = oss.str(); // Free the managed allocation and reset the device so that // new kernels can be launched after the user catches the exception. - // cudaDeviceReset() re-initializes the __managed__ pointer to nullptr. - cudaFree(detail::g_device_error); - detail::g_device_error = nullptr; + // cudaDeviceReset() re-initializes the __managed__ pointer to nullptr, + // but we track our own state via m_allocation. + cudaFree(m_allocation); + m_allocation = nullptr; cudaDeviceReset(); switch (exc) @@ -291,14 +300,16 @@ class device_error_context void ensure_allocated() { - if (detail::g_device_error == nullptr) + if (m_allocation == nullptr) { - const auto err = cudaMallocManaged(&detail::g_device_error, sizeof(detail::cuda_device_error)); + const auto err = cudaMallocManaged(&m_allocation, sizeof(detail::cuda_device_error)); if (err != cudaSuccess) { BOOST_THROW_EXCEPTION(std::runtime_error( std::string("Failed to allocate device error context: ") + cudaGetErrorString(err))); } + // Point the __managed__ global at the new allocation so device code can find it + detail::g_device_error = m_allocation; cudaDeviceSynchronize(); } } From 56832a0641d4b5d3a555dc25ec1ab2d5a27aaf2c Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 12:55:40 -0400 Subject: [PATCH 24/36] Move from managed to device memory --- .../detail/cuda_error_reporting.hpp | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp index c970694..8f20fca 100644 --- a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp +++ b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp @@ -87,13 +87,15 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE inline void copy_to_buf(char* dst, const char* sr #ifdef __CUDACC__ -// __managed__ pointer to dynamically allocated managed memory. -// Using a pointer (rather than a __managed__ struct) lets us free and -// re-allocate after cudaDeviceReset(), which is required to recover -// from __trap() corrupting the device context. -// The pointer itself lives in managed memory so device code can -// dereference it directly. -__managed__ cuda_device_error* g_device_error = nullptr; +// __device__ pointer to dynamically allocated managed memory. +// Using __device__ (not __managed__) means the pointer variable itself +// lives in device memory, not unified memory. This is critical because +// after cudaDeviceReset() a __managed__ variable's backing memory is +// freed and any host-side access segfaults. A __device__ variable is +// re-initialized to its static initializer (nullptr) when the runtime +// restarts, and the host never dereferences it directly — it uses +// cudaMemcpyToSymbol to update it. +__device__ cuda_device_error* g_device_error = nullptr; // Tracks whether a device_error_context instance is alive. // Only one may exist at a time to prevent races on g_device_error. @@ -308,8 +310,10 @@ class device_error_context BOOST_THROW_EXCEPTION(std::runtime_error( std::string("Failed to allocate device error context: ") + cudaGetErrorString(err))); } - // Point the __managed__ global at the new allocation so device code can find it - detail::g_device_error = m_allocation; + + // Point the __device__ global at the new allocation so device code can find it. + // We must use cudaMemcpyToSymbol because g_device_error is in device memory. + cudaMemcpyToSymbol(detail::g_device_error, &m_allocation, sizeof(detail::cuda_device_error*)); cudaDeviceSynchronize(); } } From 8ef83976a00f06692caa8444923a71790ea4d0b5 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 13:00:36 -0400 Subject: [PATCH 25/36] Print debugging --- examples/cuda_error_handling.cu | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/cuda_error_handling.cu b/examples/cuda_error_handling.cu index 903f8b4..0b37f8d 100644 --- a/examples/cuda_error_handling.cu +++ b/examples/cuda_error_handling.cu @@ -94,15 +94,21 @@ int main() test_type* data = nullptr; test_type* out = nullptr; + std::cout << "Reallocating Memory" << std::endl; + cudaMallocManaged(&data, 4 * sizeof(test_type)); cudaMallocManaged(&out, 4 * sizeof(test_type)); cudaDeviceSynchronize(); + std::cout << "Writing data" << std::endl; + data[0] = test_type{10}; data[1] = test_type{20}; data[2] = test_type{30}; data[3] = test_type{40}; + std::cout << "Launching kernel" << std::endl; + safe_kernel<<<1, 4>>>(data, out, 4); try From 03fe724486271dcb84e27610c236152068ed60d8 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 13:03:42 -0400 Subject: [PATCH 26/36] Change failure deallocation --- .../safe_numbers/detail/cuda_error_reporting.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp index 8f20fca..9c5691f 100644 --- a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp +++ b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp @@ -263,11 +263,12 @@ class device_error_context const auto msg = oss.str(); - // Free the managed allocation and reset the device so that - // new kernels can be launched after the user catches the exception. - // cudaDeviceReset() re-initializes the __managed__ pointer to nullptr, - // but we track our own state via m_allocation. - cudaFree(m_allocation); + // Reset the device so that new kernels can be launched after + // the user catches the exception. cudaDeviceReset() frees all + // device and managed allocations (including what m_allocation + // points to), so we do NOT call cudaFree first — the device + // context is corrupted by __trap() and cudaFree may hang or + // fail in that state. m_allocation = nullptr; cudaDeviceReset(); From ea775286b0a9c8619609f8248e600a240cc9587f Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 13:32:38 -0400 Subject: [PATCH 27/36] Change recovery path --- include/boost/safe_numbers/detail/cuda_error_reporting.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp index 9c5691f..c50b2bc 100644 --- a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp +++ b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp @@ -271,6 +271,8 @@ class device_error_context // fail in that state. m_allocation = nullptr; cudaDeviceReset(); + cudaGetLastError(); // Clear the sticky error left by __trap() + reset(); switch (exc) { From f1277145906032372783cd25901bdfbca3f0510d Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 13:55:28 -0400 Subject: [PATCH 28/36] Add a separate reset after error function --- examples/cuda_error_handling.cu | 11 +++-- .../detail/cuda_error_reporting.hpp | 42 +++++++------------ 2 files changed, 23 insertions(+), 30 deletions(-) diff --git a/examples/cuda_error_handling.cu b/examples/cuda_error_handling.cu index 0b37f8d..1eb3508 100644 --- a/examples/cuda_error_handling.cu +++ b/examples/cuda_error_handling.cu @@ -79,14 +79,17 @@ int main() catch (const std::overflow_error& e) { std::cout << "Caught overflow_error: " << e.what() << std::endl; + + // Recover from the device error: resets the device, clears the + // sticky CUDA error, and re-allocates the error reporting buffer. + ctx.reset_after_error(); } // --------------------------------------------------------------- // Step 2: After catching the error, the same ctx can be reused. - // The next synchronize() call automatically re-allocates - // managed memory for error reporting. - // Note: cudaDeviceReset() freed all prior allocations, - // so we must re-allocate our data buffers too. + // reset_after_error() restored the device and error context, + // but cudaDeviceReset() freed all prior allocations, so we + // must re-allocate our data buffers. // --------------------------------------------------------------- std::cout << "\n=== Launching kernel with valid arithmetic ===" << std::endl; diff --git a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp index c50b2bc..dc5edea 100644 --- a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp +++ b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp @@ -224,26 +224,14 @@ class device_error_context // // When a device error is detected (flag != 0): // 1. The error info is copied to local variables - // 2. The managed allocation is freed and the device is reset - // (required because __trap() corrupts the device context) - // 3. The appropriate std::exception is thrown + // 2. The appropriate std::exception is thrown // - // After catching the exception, the same context can be reused — - // the next call to synchronize() automatically re-allocates. + // After catching the exception, call reset_after_error() to restore + // the device and error context before launching new kernels. void synchronize() { const auto status = cudaDeviceSynchronize(); - if (m_allocation == nullptr) - { - if (status != cudaSuccess) - { - cudaGetLastError(); - BOOST_THROW_EXCEPTION(std::runtime_error(cudaGetErrorString(status))); - } - return; - } - // Read directly from managed memory via host-side pointer // This works even after __trap() corrupts the device context const auto flag = m_allocation->flag; @@ -263,17 +251,6 @@ class device_error_context const auto msg = oss.str(); - // Reset the device so that new kernels can be launched after - // the user catches the exception. cudaDeviceReset() frees all - // device and managed allocations (including what m_allocation - // points to), so we do NOT call cudaFree first — the device - // context is corrupted by __trap() and cudaFree may hang or - // fail in that state. - m_allocation = nullptr; - cudaDeviceReset(); - cudaGetLastError(); // Clear the sticky error left by __trap() - reset(); - switch (exc) { case detail::exception_type::domain_error: @@ -301,6 +278,19 @@ class device_error_context } } + // Call this in a catch block after synchronize() throws a device error. + // Resets the device (required because __trap() corrupts the device context), + // clears the sticky CUDA error, re-allocates the managed error buffer, + // and re-initializes the device global so new kernels can report errors. + // After this returns, the context is fully ready for new kernel launches. + void reset_after_error() + { + m_allocation = nullptr; + cudaDeviceReset(); + cudaGetLastError(); + reset(); + } + private: void ensure_allocated() From f81d621e1e23944937e14bebeee69022acd71b36 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 14:00:46 -0400 Subject: [PATCH 29/36] Try reset the runtime --- .../boost/safe_numbers/detail/cuda_error_reporting.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp index dc5edea..63791a5 100644 --- a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp +++ b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp @@ -288,6 +288,14 @@ class device_error_context m_allocation = nullptr; cudaDeviceReset(); cudaGetLastError(); + + // cudaDeviceReset() destroyed the CUDA context. Force the runtime + // to create a fresh one by re-selecting the current device before + // any allocations. + int dev {0}; + cudaGetDevice(&dev); + cudaSetDevice(dev); + reset(); } From be61bd75ecbb099f08ccd40035e7399d1cdc55a6 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 14:11:40 -0400 Subject: [PATCH 30/36] Reorder operations --- .../safe_numbers/detail/cuda_error_reporting.hpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp index 63791a5..181b408 100644 --- a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp +++ b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp @@ -286,15 +286,13 @@ class device_error_context void reset_after_error() { m_allocation = nullptr; - cudaDeviceReset(); - cudaGetLastError(); - // cudaDeviceReset() destroyed the CUDA context. Force the runtime - // to create a fresh one by re-selecting the current device before - // any allocations. - int dev {0}; - cudaGetDevice(&dev); - cudaSetDevice(dev); + // The sticky error from __trap() must be drained BEFORE + // cudaDeviceReset(), otherwise the reset call itself fails + // silently (all CUDA runtime calls return the sticky error + // until cudaGetLastError() clears it). + cudaGetLastError(); + cudaDeviceReset(); reset(); } From 28689028b8758825fffd97ccd36429044ec35dce Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 14:17:56 -0400 Subject: [PATCH 31/36] Do a harder reset --- .../safe_numbers/detail/cuda_error_reporting.hpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp index 181b408..56f381f 100644 --- a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp +++ b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp @@ -16,6 +16,7 @@ #ifdef __CUDACC__ #include +#include #endif #endif // BOOST_SAFE_NUMBERS_BUILD_MODULE @@ -287,12 +288,15 @@ class device_error_context { m_allocation = nullptr; - // The sticky error from __trap() must be drained BEFORE - // cudaDeviceReset(), otherwise the reset call itself fails - // silently (all CUDA runtime calls return the sticky error - // until cudaGetLastError() clears it). - cudaGetLastError(); - cudaDeviceReset(); + // __trap() creates a sticky error that corrupts the CUDA context. + // cudaDeviceReset() alone cannot recover from this in the same + // process. We must use the driver API to fully reset the primary + // context, then re-initialize the runtime on a fresh context. + // See: https://stackoverflow.com/questions/43659314/how-can-i-reset-the-cuda-error-to-success-with-driver-api-after-a-trap-instructi + int dev {0}; + cudaGetDevice(&dev); + cuDevicePrimaryCtxReset(dev); + cudaSetDevice(dev); reset(); } From b135e512051e412d8110c7234dff9da634fc6a38 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 14:21:13 -0400 Subject: [PATCH 32/36] Need to link to driver to manipulate context --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 94b85a9..d2c4eb3 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,7 +18,7 @@ if(HAVE_BOOST_TEST) enable_testing() - boost_test_jamfile(FILE cuda_jamfile LINK_LIBRARIES Boost::safe_numbers Boost::random CUDA::cudart COMPILE_DEFINITIONS BOOST_SAFE_NUMBERS_ENABLE_CUDA=1) + boost_test_jamfile(FILE cuda_jamfile LINK_LIBRARIES Boost::safe_numbers Boost::random CUDA::cudart CUDA::cuda_driver COMPILE_DEFINITIONS BOOST_SAFE_NUMBERS_ENABLE_CUDA=1) else() From 732fe8f70ee06dead2c97a6fc201eba3ddb69c9a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 14:44:33 -0400 Subject: [PATCH 33/36] Remove __trap as it causes an unrecoverable error for the process --- examples/cuda_error_handling.cu | 25 +-- .../detail/cuda_error_reporting.hpp | 158 +++++------------- test/CMakeLists.txt | 2 +- 3 files changed, 48 insertions(+), 137 deletions(-) diff --git a/examples/cuda_error_handling.cu b/examples/cuda_error_handling.cu index 1eb3508..9f1c530 100644 --- a/examples/cuda_error_handling.cu +++ b/examples/cuda_error_handling.cu @@ -9,11 +9,9 @@ // you call ctx.synchronize(). // // The device_error_context manages a dynamically allocated managed -// memory buffer. When an error is detected, synchronize() copies the -// error info to host locals, frees the managed buffer, resets the -// device, and throws. After catching the exception, the same context -// can be reused — the next call to synchronize() automatically -// re-allocates fresh managed memory via reset(). +// memory buffer. When an error is detected, synchronize() clears the +// error state and throws. After catching the exception, the same +// context can be reused immediately for new kernel launches. #include #include @@ -69,8 +67,8 @@ int main() // synchronize() waits for the kernel, reads the error state, // and throws the appropriate std::exception if one was captured. - // On error it also calls cudaDeviceReset() internally, so the - // device is ready for fresh work after catching the exception. + // On error it clears the error state before throwing, so the + // context is immediately reusable after catching the exception. try { ctx.synchronize(); @@ -79,17 +77,12 @@ int main() catch (const std::overflow_error& e) { std::cout << "Caught overflow_error: " << e.what() << std::endl; - - // Recover from the device error: resets the device, clears the - // sticky CUDA error, and re-allocates the error reporting buffer. - ctx.reset_after_error(); } // --------------------------------------------------------------- - // Step 2: After catching the error, the same ctx can be reused. - // reset_after_error() restored the device and error context, - // but cudaDeviceReset() freed all prior allocations, so we - // must re-allocate our data buffers. + // Step 2: After catching the error, the same ctx can be reused + // immediately. synchronize() already cleared the error + // state before throwing, so no recovery step is needed. // --------------------------------------------------------------- std::cout << "\n=== Launching kernel with valid arithmetic ===" << std::endl; @@ -134,9 +127,9 @@ int main() // Cleanup // --------------------------------------------------------------- + cudaFree(result); cudaFree(data); cudaFree(out); - cudaDeviceReset(); return 0; } diff --git a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp index 56f381f..669fe36 100644 --- a/include/boost/safe_numbers/detail/cuda_error_reporting.hpp +++ b/include/boost/safe_numbers/detail/cuda_error_reporting.hpp @@ -16,7 +16,6 @@ #ifdef __CUDACC__ #include -#include #endif #endif // BOOST_SAFE_NUMBERS_BUILD_MODULE @@ -88,15 +87,9 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE inline void copy_to_buf(char* dst, const char* sr #ifdef __CUDACC__ -// __device__ pointer to dynamically allocated managed memory. -// Using __device__ (not __managed__) means the pointer variable itself -// lives in device memory, not unified memory. This is critical because -// after cudaDeviceReset() a __managed__ variable's backing memory is -// freed and any host-side access segfaults. A __device__ variable is -// re-initialized to its static initializer (nullptr) when the runtime -// restarts, and the host never dereferences it directly — it uses -// cudaMemcpyToSymbol to update it. -__device__ cuda_device_error* g_device_error = nullptr; +// Managed memory error struct accessible from both host and device. +// Since we never destroy the CUDA context, __managed__ is safe to use. +__managed__ cuda_device_error g_device_error {}; // Tracks whether a device_error_context instance is alive. // Only one may exist at a time to prevent races on g_device_error. @@ -110,28 +103,22 @@ __host__ __device__ inline void report_device_error( { #ifdef __CUDA_ARCH__ - if (g_device_error != nullptr && atomicCAS(&g_device_error->flag, 0, 1) == 0) + if (atomicCAS(&g_device_error.flag, 0, 1) == 0) { - g_device_error->line = line; - g_device_error->thread_id = blockIdx.x * blockDim.x + threadIdx.x; - g_device_error->exception = exc; + g_device_error.line = line; + g_device_error.thread_id = blockIdx.x * blockDim.x + threadIdx.x; + g_device_error.exception = exc; - copy_to_buf(g_device_error->file, file, BOOST_SAFE_NUMBERS_DEVICE_ERROR_BUFFER_SIZE); - copy_to_buf(g_device_error->expression, expression, BOOST_SAFE_NUMBERS_DEVICE_ERROR_BUFFER_SIZE); + copy_to_buf(g_device_error.file, file, BOOST_SAFE_NUMBERS_DEVICE_ERROR_BUFFER_SIZE); + copy_to_buf(g_device_error.expression, expression, BOOST_SAFE_NUMBERS_DEVICE_ERROR_BUFFER_SIZE); __threadfence_system(); - - printf("Device error on thread %d at %s:%d: %s\n", - blockIdx.x * blockDim.x + threadIdx.x, - file, line, expression); - - __trap(); } - // Other threads: spin until the trap terminates the kernel - while (true) - { - __nanosleep(1000000); - } + // Return instead of calling __trap(). This allows the kernel to + // complete normally without corrupting the CUDA context. Other + // threads may continue with incorrect values, but synchronize() + // will detect the error via the flag and throw on the host. + return; #else const auto msg = std::string(file) + ":" + std::to_string(line) + ": " + expression; @@ -163,18 +150,9 @@ __host__ __device__ inline void report_device_error( class device_error_context { - // Host-side mirror of the managed allocation pointer. - // All host-side reads go through this so we never touch the - // __managed__ variable g_device_error after cudaDeviceReset(). - detail::cuda_device_error* m_allocation {nullptr}; - public: - // Allocates the managed error struct if it does not already exist, - // then clears the error state. After cudaDeviceReset() the __managed__ - // pointer is back to nullptr, so the next call to reset() or - // synchronize() re-allocates automatically. - // Only one device_error_context may exist at a time. + // Clears the error state. Only one device_error_context may exist at a time. device_error_context() { if (detail::g_device_error_context_active) @@ -183,20 +161,11 @@ class device_error_context "Only one device_error_context may exist at a time")); } detail::g_device_error_context_active = true; - ensure_allocated(); reset(); } - // Free the managed allocation during normal (non-error) shutdown. - // Uses the host-side m_allocation pointer so this is safe even - // after cudaDeviceReset() has invalidated the __managed__ global. ~device_error_context() { - if (m_allocation != nullptr) - { - cudaFree(m_allocation); - m_allocation = nullptr; - } detail::g_device_error_context_active = false; } @@ -204,54 +173,43 @@ class device_error_context device_error_context& operator=(const device_error_context&) = delete; // Clears the error fields so the context can be reused across kernel launches. - // If the managed buffer was freed (e.g. after a device reset), re-allocates it. void reset() { - if (m_allocation == nullptr) - { - ensure_allocated(); - } - - m_allocation->flag = 0; - m_allocation->line = 0; - m_allocation->thread_id = 0; - m_allocation->exception = detail::exception_type::unknown; - m_allocation->file[0] = '\0'; - m_allocation->expression[0] = '\0'; + detail::g_device_error.flag = 0; + detail::g_device_error.line = 0; + detail::g_device_error.thread_id = 0; + detail::g_device_error.exception = detail::exception_type::unknown; + detail::g_device_error.file[0] = '\0'; + detail::g_device_error.expression[0] = '\0'; } - // Allows the user to synchronize and check for errors as is typical of CUDA. - // This allows an extra step in that it will throw on the host. - // - // When a device error is detected (flag != 0): - // 1. The error info is copied to local variables - // 2. The appropriate std::exception is thrown - // - // After catching the exception, call reset_after_error() to restore - // the device and error context before launching new kernels. + // Synchronizes the device and checks for errors captured by device code. + // If an error was detected, the error state is cleared (so the context + // is immediately reusable), and the appropriate std::exception is thrown. void synchronize() { const auto status = cudaDeviceSynchronize(); - // Read directly from managed memory via host-side pointer - // This works even after __trap() corrupts the device context - const auto flag = m_allocation->flag; + const auto flag = detail::g_device_error.flag; if (flag != 0) { - // Copy everything we need to local storage before freeing - const auto thread_id = m_allocation->thread_id; - const auto line = m_allocation->line; - const auto exc = m_allocation->exception; + const auto thread_id = detail::g_device_error.thread_id; + const auto line = detail::g_device_error.line; + const auto exc = detail::g_device_error.exception; std::ostringstream oss; oss << "Device error on thread " << thread_id - << " at " << m_allocation->file + << " at " << detail::g_device_error.file << ":" << line - << ": " << m_allocation->expression; + << ": " << detail::g_device_error.expression; const auto msg = oss.str(); + // Clear the error state so the context can be reused + // immediately after catching the exception. + reset(); + switch (exc) { case detail::exception_type::domain_error: @@ -269,8 +227,10 @@ class device_error_context BOOST_THROW_EXCEPTION(std::runtime_error(msg)); } } - - reset(); + else + { + reset(); + } if (status != cudaSuccess) { @@ -278,48 +238,6 @@ class device_error_context BOOST_THROW_EXCEPTION(std::runtime_error(cudaGetErrorString(status))); } } - - // Call this in a catch block after synchronize() throws a device error. - // Resets the device (required because __trap() corrupts the device context), - // clears the sticky CUDA error, re-allocates the managed error buffer, - // and re-initializes the device global so new kernels can report errors. - // After this returns, the context is fully ready for new kernel launches. - void reset_after_error() - { - m_allocation = nullptr; - - // __trap() creates a sticky error that corrupts the CUDA context. - // cudaDeviceReset() alone cannot recover from this in the same - // process. We must use the driver API to fully reset the primary - // context, then re-initialize the runtime on a fresh context. - // See: https://stackoverflow.com/questions/43659314/how-can-i-reset-the-cuda-error-to-success-with-driver-api-after-a-trap-instructi - int dev {0}; - cudaGetDevice(&dev); - cuDevicePrimaryCtxReset(dev); - cudaSetDevice(dev); - - reset(); - } - -private: - - void ensure_allocated() - { - if (m_allocation == nullptr) - { - const auto err = cudaMallocManaged(&m_allocation, sizeof(detail::cuda_device_error)); - if (err != cudaSuccess) - { - BOOST_THROW_EXCEPTION(std::runtime_error( - std::string("Failed to allocate device error context: ") + cudaGetErrorString(err))); - } - - // Point the __device__ global at the new allocation so device code can find it. - // We must use cudaMemcpyToSymbol because g_device_error is in device memory. - cudaMemcpyToSymbol(detail::g_device_error, &m_allocation, sizeof(detail::cuda_device_error*)); - cudaDeviceSynchronize(); - } - } }; #endif // __CUDACC__ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d2c4eb3..94b85a9 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,7 +18,7 @@ if(HAVE_BOOST_TEST) enable_testing() - boost_test_jamfile(FILE cuda_jamfile LINK_LIBRARIES Boost::safe_numbers Boost::random CUDA::cudart CUDA::cuda_driver COMPILE_DEFINITIONS BOOST_SAFE_NUMBERS_ENABLE_CUDA=1) + boost_test_jamfile(FILE cuda_jamfile LINK_LIBRARIES Boost::safe_numbers Boost::random CUDA::cudart COMPILE_DEFINITIONS BOOST_SAFE_NUMBERS_ENABLE_CUDA=1) else() From a55791e15b37a796993447dbfa68189b77ed53ca Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 26 Mar 2026 15:15:38 -0400 Subject: [PATCH 34/36] Remove now unneeded print statements --- examples/cuda_error_handling.cu | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/cuda_error_handling.cu b/examples/cuda_error_handling.cu index 9f1c530..2e9efff 100644 --- a/examples/cuda_error_handling.cu +++ b/examples/cuda_error_handling.cu @@ -90,21 +90,15 @@ int main() test_type* data = nullptr; test_type* out = nullptr; - std::cout << "Reallocating Memory" << std::endl; - cudaMallocManaged(&data, 4 * sizeof(test_type)); cudaMallocManaged(&out, 4 * sizeof(test_type)); cudaDeviceSynchronize(); - std::cout << "Writing data" << std::endl; - data[0] = test_type{10}; data[1] = test_type{20}; data[2] = test_type{30}; data[3] = test_type{40}; - std::cout << "Launching kernel" << std::endl; - safe_kernel<<<1, 4>>>(data, out, 4); try From 46a04da326b03b61b80677fa7affca25dfa9579a Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 27 Mar 2026 11:34:06 -0400 Subject: [PATCH 35/36] Implement byteswap since only NVCC 13+ has it --- include/boost/safe_numbers/bit.hpp | 35 +++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/include/boost/safe_numbers/bit.hpp b/include/boost/safe_numbers/bit.hpp index 640f1ff..35e6c33 100644 --- a/include/boost/safe_numbers/bit.hpp +++ b/include/boost/safe_numbers/bit.hpp @@ -1,3 +1,4 @@ +// Copyright 2020 Peter Dimov // Copyright 2026 Matt Borland // Distributed under the Boost Software License, Version 1.0. // https://www.boost.org/LICENSE_1_0.txt @@ -292,6 +293,38 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto popcount(const Unsig #endif } +// NVCC 12 does not have byteswap builtin, only 13+ +#if (defined(BOOST_SAFE_NUMBERS_ENABLE_CUDA) && defined(__CUDACC__)) + +namespace detail { + +constexpr auto byteswap_impl(const std::uint8_t x) noexcept +{ + return x; +} + +constexpr auto byteswap_impl(const std::uint16_t x) noexcept +{ + return static_cast( x << 8 | x >> 8 ); +} + +constexpr auto byteswap_impl(const std::uint32_t x) noexcept +{ + const auto step16 = x << 16 | x >> 16; + return ((step16 << 8) & 0xff00ff00) | ((step16 >> 8) & 0x00ff00ff); +} + +constexpr auto byteswap_impl(const std::uint64_t x) noexcept +{ + const auto step32 = x << 32 | x >> 32; + const auto step16 = (step32 & 0x0000FFFF0000FFFFULL) << 16 | (step32 & 0xFFFF0000FFFF0000ULL) >> 16; + return (step16 & 0x00FF00FF00FF00FFULL) << 8 | (step16 & 0xFF00FF00FF00FF00ULL) >> 8; +} + +} // namespace detail + +#endif + BOOST_SAFE_NUMBERS_EXPORT template BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto byteswap(const Int x) noexcept -> Int { @@ -310,7 +343,7 @@ BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto byteswap(const Int x } else { - return Int{cuda::std::byteswap(static_cast(x))}; + return Int{detail::byteswap_impl(static_cast(x))}; } #endif From df77270b8b45e2a4ca41b1600f5f71275559ae90 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 27 Mar 2026 12:00:47 -0400 Subject: [PATCH 36/36] Add GPU markers to documentation pages where applicable --- doc/modules/ROOT/pages/bit.adoc | 26 ++--- doc/modules/ROOT/pages/byte_conversions.adoc | 12 +-- doc/modules/ROOT/pages/charconv.adoc | 9 ++ doc/modules/ROOT/pages/cuda.adoc | 2 +- doc/modules/ROOT/pages/integer_utilities.adoc | 22 ++--- doc/modules/ROOT/pages/limits.adoc | 36 +++---- doc/modules/ROOT/pages/numeric.adoc | 6 +- doc/modules/ROOT/pages/unsigned_integers.adoc | 94 +++++++++++++------ 8 files changed, 125 insertions(+), 82 deletions(-) diff --git a/doc/modules/ROOT/pages/bit.adoc b/doc/modules/ROOT/pages/bit.adoc index 2bf481a..f321073 100644 --- a/doc/modules/ROOT/pages/bit.adoc +++ b/doc/modules/ROOT/pages/bit.adoc @@ -26,7 +26,7 @@ For `u128`, the functions delegate to the `boost::int128` implementations. [source,c++] ---- template -[[nodiscard]] constexpr auto has_single_bit(UnsignedInt x) noexcept -> bool; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto has_single_bit(UnsignedInt x) noexcept -> bool; ---- Returns `true` if `x` is a power of two. @@ -37,7 +37,7 @@ See https://en.cppreference.com/w/cpp/numeric/has_single_bit.html[`std::has_sing [source,c++] ---- template -[[nodiscard]] constexpr auto bit_ceil(UnsignedInt x) noexcept -> UnsignedInt; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_ceil(UnsignedInt x) noexcept -> UnsignedInt; ---- Returns the smallest power of two not less than `x`. @@ -49,7 +49,7 @@ See https://en.cppreference.com/w/cpp/numeric/bit_ceil.html[`std::bit_ceil`]. [source,c++] ---- template -[[nodiscard]] constexpr auto bit_floor(UnsignedInt x) noexcept -> UnsignedInt; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_floor(UnsignedInt x) noexcept -> UnsignedInt; ---- Returns the largest power of two not greater than `x`. @@ -61,7 +61,7 @@ See https://en.cppreference.com/w/cpp/numeric/bit_floor.html[`std::bit_floor`]. [source,c++] ---- template -[[nodiscard]] constexpr auto bit_width(UnsignedInt x) noexcept -> int; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bit_width(UnsignedInt x) noexcept -> int; ---- Returns the number of bits needed to represent `x` (i.e., 1 + floor(log2(x)) for x > 0, or 0 for x == 0). @@ -74,7 +74,7 @@ See https://en.cppreference.com/w/cpp/numeric/bit_width.html[`std::bit_width`]. [source,c++] ---- template -[[nodiscard]] constexpr auto rotl(UnsignedInt x, int s) noexcept -> UnsignedInt; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto rotl(UnsignedInt x, int s) noexcept -> UnsignedInt; ---- Computes the result of bitwise left-rotating `x` by `s` positions. @@ -87,7 +87,7 @@ NOTE: `rotl` is not available for `bounded_uint` types. Bit rotation can produce [source,c++] ---- template -[[nodiscard]] constexpr auto rotr(UnsignedInt x, int s) noexcept -> UnsignedInt; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto rotr(UnsignedInt x, int s) noexcept -> UnsignedInt; ---- Computes the result of bitwise right-rotating `x` by `s` positions. @@ -102,7 +102,7 @@ NOTE: `rotr` is not available for `bounded_uint` types. Bit rotation can produce [source,c++] ---- template -[[nodiscard]] constexpr auto countl_zero(UnsignedInt x) noexcept -> int; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countl_zero(UnsignedInt x) noexcept -> int; ---- Returns the number of consecutive 0-bits starting from the most significant bit. @@ -113,7 +113,7 @@ See https://en.cppreference.com/w/cpp/numeric/countl_zero.html[`std::countl_zero [source,c++] ---- template -[[nodiscard]] constexpr auto countl_one(UnsignedInt x) noexcept -> int; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countl_one(UnsignedInt x) noexcept -> int; ---- Returns the number of consecutive 1-bits starting from the most significant bit. @@ -124,7 +124,7 @@ See https://en.cppreference.com/w/cpp/numeric/countl_one.html[`std::countl_one`] [source,c++] ---- template -[[nodiscard]] constexpr auto countr_zero(UnsignedInt x) noexcept -> int; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countr_zero(UnsignedInt x) noexcept -> int; ---- Returns the number of consecutive 0-bits starting from the least significant bit. @@ -135,7 +135,7 @@ See https://en.cppreference.com/w/cpp/numeric/countr_zero.html[`std::countr_zero [source,c++] ---- template -[[nodiscard]] constexpr auto countr_one(UnsignedInt x) noexcept -> int; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto countr_one(UnsignedInt x) noexcept -> int; ---- Returns the number of consecutive 1-bits starting from the least significant bit. @@ -146,7 +146,7 @@ See https://en.cppreference.com/w/cpp/numeric/countr_one.html[`std::countr_one`] [source,c++] ---- template -[[nodiscard]] constexpr auto popcount(UnsignedInt x) noexcept -> int; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto popcount(UnsignedInt x) noexcept -> int; ---- Returns the number of 1-bits in `x`. @@ -159,7 +159,7 @@ See https://en.cppreference.com/w/cpp/numeric/popcount.html[`std::popcount`]. [source,c++] ---- template -[[nodiscard]] constexpr auto byteswap(Int x) noexcept -> Int; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto byteswap(Int x) noexcept -> Int; ---- Reverses the bytes of `x`. @@ -172,7 +172,7 @@ NOTE: `byteswap` is not available for `bounded_uint` types. Byte reversal can pr [source,c++] ---- template -[[nodiscard]] constexpr auto bitswap(Int x) noexcept -> Int; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto bitswap(Int x) noexcept -> Int; ---- Reverses all bits of `x`. diff --git a/doc/modules/ROOT/pages/byte_conversions.adoc b/doc/modules/ROOT/pages/byte_conversions.adoc index 152aa3c..5696e41 100644 --- a/doc/modules/ROOT/pages/byte_conversions.adoc +++ b/doc/modules/ROOT/pages/byte_conversions.adoc @@ -170,7 +170,7 @@ The value is first converted to big-endian byte order using `to_be`, then reinte [source,c++] ---- template -[[nodiscard]] constexpr auto to_be_bytes(const T value) noexcept -> std::array; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_be_bytes(const T value) noexcept -> std::array; ---- === Parameters @@ -203,7 +203,7 @@ The bytes are reinterpreted as the underlying type and then converted from big-e [source,c++] ---- template -[[nodiscard]] constexpr auto from_be_bytes(const std::span bytes) -> T; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_be_bytes(const std::span bytes) -> T; ---- === Parameters @@ -251,7 +251,7 @@ The value is first converted to little-endian byte order using `to_le`, then rei [source,c++] ---- template -[[nodiscard]] constexpr auto to_le_bytes(const T value) noexcept -> std::array; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_le_bytes(const T value) noexcept -> std::array; ---- === Parameters @@ -284,7 +284,7 @@ The bytes are reinterpreted as the underlying type and then converted from littl [source,c++] ---- template -[[nodiscard]] constexpr auto from_le_bytes(const std::span bytes) -> T; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_le_bytes(const std::span bytes) -> T; ---- === Parameters @@ -334,7 +334,7 @@ The result is equivalent to `std::bit_cast>(val [source,c++] ---- template -[[nodiscard]] constexpr auto to_ne_bytes(const T value) noexcept -> std::array; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto to_ne_bytes(const T value) noexcept -> std::array; ---- === Parameters @@ -368,7 +368,7 @@ Delegates to `from_le_bytes` on little-endian platforms and `from_be_bytes` on b [source,c++] ---- template -[[nodiscard]] constexpr auto from_ne_bytes(const std::span bytes) -> T; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto from_ne_bytes(const std::span bytes) -> T; ---- === Parameters diff --git a/doc/modules/ROOT/pages/charconv.adoc b/doc/modules/ROOT/pages/charconv.adoc index 599b073..6bc6af4 100644 --- a/doc/modules/ROOT/pages/charconv.adoc +++ b/doc/modules/ROOT/pages/charconv.adoc @@ -33,12 +33,14 @@ namespace boost::charconv { // Convert safe integer to character string template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto to_chars(char* first, char* last, T value, int base = 10) -> charconv::to_chars_result; // Convert character string to safe integer template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto from_chars(const char* first, const char* last, T& value, int base = 10) -> charconv::from_chars_result; @@ -57,8 +59,10 @@ struct to_chars_result char* ptr; std::errc ec; + BOOST_SAFE_NUMBERS_HOST_DEVICE friend constexpr bool operator==(const to_chars_result& lhs, const to_chars_result& rhs) noexcept = default; + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr explicit operator bool() const noexcept { return ec == std::errc{}; } }; @@ -83,8 +87,11 @@ struct from_chars_result const char* ptr; std::errc ec; + BOOST_SAFE_NUMBERS_HOST_DEVICE friend constexpr bool operator==(const from_chars_result& lhs, const from_chars_result& rhs) noexcept = default; + + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr explicit operator bool() const noexcept { return ec == std::errc{}; } }; @@ -103,6 +110,7 @@ struct from_chars_result [source,c++] ---- template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto to_chars(char* first, char* last, T value, int base = 10) -> charconv::to_chars_result; @@ -133,6 +141,7 @@ Returns `boost::charconv::to_chars_result` with: [source,c++] ---- template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto from_chars(const char* first, const char* last, T& value, int base = 10) -> charconv::from_chars_result; diff --git a/doc/modules/ROOT/pages/cuda.adoc b/doc/modules/ROOT/pages/cuda.adoc index e772070..dc7ff15 100644 --- a/doc/modules/ROOT/pages/cuda.adoc +++ b/doc/modules/ROOT/pages/cuda.adoc @@ -92,5 +92,5 @@ Device error on thread 256 at /home/runner/work/safe_numbers/boost-root/libs/saf The `device_error_context` will also attempt to `printf` the error into the terminal. This works when compiling with verbose mode `-V`. -`printf` error messages will look the same as the message displayed by +`printf` error messages will look the same as the message displayed by the thrown exception diff --git a/doc/modules/ROOT/pages/integer_utilities.adoc b/doc/modules/ROOT/pages/integer_utilities.adoc index 8dc4a0e..d01f31b 100644 --- a/doc/modules/ROOT/pages/integer_utilities.adoc +++ b/doc/modules/ROOT/pages/integer_utilities.adoc @@ -23,7 +23,7 @@ These operate on the non-bounded unsigned types (`u8`, `u16`, `u32`, `u64`, `u12 [source,c++] ---- template -[[nodiscard]] constexpr auto isqrt(const T val) -> T; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto isqrt(const T val) -> T; ---- Returns the integer square root of `val`, i.e., the largest integer `r` such that `r * r \<= val`. @@ -74,7 +74,7 @@ struct remove_trailing_zeros_return [source,c++] ---- template -[[nodiscard]] constexpr auto remove_trailing_zeros(const T n); +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto remove_trailing_zeros(const T n); ---- Removes all trailing decimal zeros from `n`. @@ -141,7 +141,7 @@ Tests whether an unsigned integer value is an exact power of 10 (i.e., one of 1, [source,c++] ---- template -[[nodiscard]] constexpr auto is_power_10(const T n) -> bool; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto is_power_10(const T n) -> bool; ---- === Parameters @@ -171,7 +171,7 @@ Returns the integer base-2 logarithm (floor of log~2~) of a value. [source,c++] ---- template -[[nodiscard]] constexpr auto ilog2(const T n) -> int; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ilog2(const T n) -> int; ---- Computes `floor(log~2~(n))` using `bit_width(n) - 1`. @@ -213,7 +213,7 @@ Uses an O(1) algorithm based on the most significant bit position to approximate [source,c++] ---- template -[[nodiscard]] constexpr auto ilog10(const T n) -> int; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ilog10(const T n) -> int; ---- Computes `floor(log~10~(n))` using `num_digits(n) - 1`, where `num_digits` approximates the digit count via `log~10~(x) ~= log~2~(x) / log~2~(10)` and refines with at most two comparisons against a power-of-10 lookup table. @@ -249,7 +249,7 @@ Returns the integer logarithm in an arbitrary base (floor of log~base~) of a val [source,c++] ---- template -[[nodiscard]] constexpr auto ilog(const T n, const T base) -> int; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ilog(const T n, const T base) -> int; ---- Computes `floor(log~base~(n))` by repeated division. @@ -294,7 +294,7 @@ Integer exponentiation using the exponentiation-by-squaring algorithm. [source,c++] ---- template -[[nodiscard]] constexpr auto ipow(const T a, const T b) noexcept -> T; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto ipow(const T a, const T b) noexcept -> T; ---- Computes `a` raised to the power `b` using exponentiation by squaring. @@ -339,7 +339,7 @@ Tests whether an unsigned integer value is an exact power of 2 (i.e., has exactl [source,c++] ---- template -[[nodiscard]] constexpr auto is_power_2(const T n) noexcept -> bool; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto is_power_2(const T n) noexcept -> bool; ---- === Parameters @@ -370,7 +370,7 @@ For unsigned types, naive subtraction `a - b` when `b > a` would underflow; `abs [source,c++] ---- template -[[nodiscard]] constexpr auto abs_diff(const T a, const T b) noexcept -> T; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto abs_diff(const T a, const T b) noexcept -> T; ---- Returns `|a - b|`, computed as `a - b` if `a >= b`, or `b - a` otherwise. @@ -411,7 +411,7 @@ For unsigned types, this is equivalent to `(a + b - 1) / b` but computed without [source,c++] ---- template -[[nodiscard]] constexpr auto div_ceil(const T a, const T b) noexcept -> T; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto div_ceil(const T a, const T b) noexcept -> T; ---- Returns the ceiling of `a / b`. @@ -454,7 +454,7 @@ This is useful for alignment calculations (e.g., aligning a size to a page bound [source,c++] ---- template -[[nodiscard]] constexpr auto next_multiple_of(const T a, const T b) noexcept -> T; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto next_multiple_of(const T a, const T b) noexcept -> T; ---- Returns the smallest value `m` such that `m >= a` and `m % b == 0`. diff --git a/doc/modules/ROOT/pages/limits.adoc b/doc/modules/ROOT/pages/limits.adoc index fe9aa3a..f273a2b 100644 --- a/doc/modules/ROOT/pages/limits.adoc +++ b/doc/modules/ROOT/pages/limits.adoc @@ -79,15 +79,15 @@ struct numeric_limits static constexpr bool tinyness_before = std::numeric_limits::tinyness_before; // Static member functions - static constexpr T min() noexcept; - static constexpr T max() noexcept; - static constexpr T lowest() noexcept; - static constexpr T epsilon() noexcept; - static constexpr T round_error() noexcept; - static constexpr T infinity() noexcept; - static constexpr T quiet_NaN() noexcept; - static constexpr T signaling_NaN() noexcept; - static constexpr T denorm_min() noexcept; + BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T min() noexcept; + BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T max() noexcept; + BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T lowest() noexcept; + BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T epsilon() noexcept; + BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T round_error() noexcept; + BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T infinity() noexcept; + BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T quiet_NaN() noexcept; + BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T signaling_NaN() noexcept; + BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T denorm_min() noexcept; }; } // namespace std @@ -138,63 +138,63 @@ For unsigned integer types, the following values are consistent across all speci [source,c++] ---- -static constexpr T min() noexcept; +BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T min() noexcept; ---- Returns the minimum finite value (always `Tpass:[{0}]` for unsigned types). [source,c++] ---- -static constexpr T max() noexcept; +BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T max() noexcept; ---- Returns the maximum finite value. [source,c++] ---- -static constexpr T lowest() noexcept; +BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T lowest() noexcept; ---- Returns the lowest finite value (same as `min()` for unsigned types). [source,c++] ---- -static constexpr T epsilon() noexcept; +BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T epsilon() noexcept; ---- Returns `Tpass:[{0}]` (not meaningful for integer types). [source,c++] ---- -static constexpr T round_error() noexcept; +BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T round_error() noexcept; ---- Returns `Tpass:[{0}]` (not meaningful for integer types). [source,c++] ---- -static constexpr T infinity() noexcept; +BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T infinity() noexcept; ---- Returns `Tpass:[{0}]` (unsigned integers cannot represent infinity). [source,c++] ---- -static constexpr T quiet_NaN() noexcept; +BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T quiet_NaN() noexcept; ---- Returns `Tpass:[{0}]` (unsigned integers cannot represent NaN). [source,c++] ---- -static constexpr T signaling_NaN() noexcept; +BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T signaling_NaN() noexcept; ---- Returns `Tpass:[{0}]` (unsigned integers cannot represent NaN). [source,c++] ---- -static constexpr T denorm_min() noexcept; +BOOST_SAFE_NUMBERS_HOST_DEVICE static constexpr T denorm_min() noexcept; ---- Returns `Tpass:[{0}]` (not meaningful for integer types). diff --git a/doc/modules/ROOT/pages/numeric.adoc b/doc/modules/ROOT/pages/numeric.adoc index f058ebd..6899858 100644 --- a/doc/modules/ROOT/pages/numeric.adoc +++ b/doc/modules/ROOT/pages/numeric.adoc @@ -25,7 +25,7 @@ Computes the greatest common divisor of two integers using the Euclidean algorit [source,c++] ---- template -[[nodiscard]] constexpr auto gcd(const T m, const T n) noexcept -> T; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto gcd(const T m, const T n) noexcept -> T; ---- Returns the greatest common divisor of `m` and `n`. @@ -65,7 +65,7 @@ Computes the least common multiple of two integers. [source,c++] ---- template -[[nodiscard]] constexpr auto lcm(const T m, const T n) noexcept -> T; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto lcm(const T m, const T n) noexcept -> T; ---- Returns the least common multiple of `m` and `n`. @@ -106,7 +106,7 @@ The result is rounded towards the first argument `a`. [source,c++] ---- template -[[nodiscard]] constexpr auto midpoint(const T a, const T b) noexcept -> T; +BOOST_SAFE_NUMBERS_HOST_DEVICE [[nodiscard]] constexpr auto midpoint(const T a, const T b) noexcept -> T; ---- Returns the midpoint of `a` and `b`, computed without overflow. diff --git a/doc/modules/ROOT/pages/unsigned_integers.adoc b/doc/modules/ROOT/pages/unsigned_integers.adoc index b0f12bc..842252f 100644 --- a/doc/modules/ROOT/pages/unsigned_integers.adoc +++ b/doc/modules/ROOT/pages/unsigned_integers.adoc @@ -44,103 +44,120 @@ public: using basis_type = BasisType; // Construction - constexpr unsigned_integer_basis() noexcept = default; - explicit constexpr unsigned_integer_basis(BasisType val) noexcept; + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr unsigned_integer_basis() noexcept = default; + BOOST_SAFE_NUMBERS_HOST_DEVICE explicit constexpr unsigned_integer_basis(BasisType val) noexcept; template requires std::is_same_v - explicit constexpr unsigned_integer_basis(T) noexcept = delete; // bool prohibited + BOOST_SAFE_NUMBERS_HOST_DEVICE explicit constexpr unsigned_integer_basis(T) noexcept = delete; // bool prohibited // Conversion to underlying types template - explicit constexpr operator OtherBasis() const; + BOOST_SAFE_NUMBERS_HOST_DEVICE explicit constexpr operator OtherBasis() const; // Comparison operators + BOOST_SAFE_NUMBERS_HOST_DEVICE friend constexpr auto operator<=>(unsigned_integer_basis lhs, unsigned_integer_basis rhs) noexcept -> std::strong_ordering = default; // Compound assignment operators (arithmetic) template + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator+=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; template + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator-=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; template + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator*=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; template + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator/=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; template + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator%=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; // Compound assignment operators (bitwise) - constexpr auto operator&=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&; - constexpr auto operator|=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&; - constexpr auto operator^=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&; - constexpr auto operator<<=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; - constexpr auto operator>>=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator&=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&; + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator|=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&; + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator^=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&; + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator<<=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator>>=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; // Increment and decrement operators - constexpr auto operator++() -> unsigned_integer_basis&; - constexpr auto operator++(int) -> unsigned_integer_basis; - constexpr auto operator--() -> unsigned_integer_basis&; - constexpr auto operator--(int) -> unsigned_integer_basis; + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator++() -> unsigned_integer_basis&; + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator++(int) -> unsigned_integer_basis; + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator--() -> unsigned_integer_basis&; + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator--(int) -> unsigned_integer_basis; // Unary operators - constexpr auto operator+() const noexcept -> unsigned_integer_basis; - constexpr auto operator-() const noexcept; // compile-time error + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator+() const noexcept -> unsigned_integer_basis; + BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator-() const noexcept; // compile-time error }; // class unsigned_integer_basis // Arithmetic operators (throw on overflow/underflow) template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator+(unsigned_integer_basis lhs, unsigned_integer_basis rhs) -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator-(unsigned_integer_basis lhs, unsigned_integer_basis rhs) -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator*(unsigned_integer_basis lhs, unsigned_integer_basis rhs) -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator/(unsigned_integer_basis lhs, unsigned_integer_basis rhs) -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator%(unsigned_integer_basis lhs, unsigned_integer_basis rhs) -> unsigned_integer_basis; // Bitwise operators template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator~(unsigned_integer_basis lhs) noexcept -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator&(unsigned_integer_basis lhs, unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator|(unsigned_integer_basis lhs, unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator^(unsigned_integer_basis lhs, unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator<<(unsigned_integer_basis lhs, unsigned_integer_basis rhs) -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator>>(unsigned_integer_basis lhs, unsigned_integer_basis rhs) -> unsigned_integer_basis; @@ -269,7 +286,7 @@ constexpr auto shr(T lhs, T rhs); [source,c++] ---- -constexpr unsigned_integer_basis() noexcept = default; +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr unsigned_integer_basis() noexcept = default; ---- Values are default-initialized to zero. @@ -278,7 +295,7 @@ Values are default-initialized to zero. [source,c++] ---- -explicit constexpr unsigned_integer_basis(BasisType val) noexcept; +BOOST_SAFE_NUMBERS_HOST_DEVICE explicit constexpr unsigned_integer_basis(BasisType val) noexcept; ---- Construction from the underlying type is explicit to prevent accidental conversions. @@ -289,7 +306,7 @@ Construction from the underlying type is explicit to prevent accidental conversi ---- template requires std::is_same_v -explicit constexpr unsigned_integer_basis(T) noexcept = delete; +BOOST_SAFE_NUMBERS_HOST_DEVICE explicit constexpr unsigned_integer_basis(T) noexcept = delete; ---- Constructing from `bool` is a compile-time error. @@ -299,7 +316,7 @@ Constructing from `bool` is a compile-time error. [source,c++] ---- template -explicit constexpr operator OtherBasis() const; +BOOST_SAFE_NUMBERS_HOST_DEVICE explicit constexpr operator OtherBasis() const; ---- Conversion to other unsigned integral types is explicit. @@ -311,6 +328,7 @@ This allows safe narrowing when the value is known to fit at runtime. [source,c++] ---- +BOOST_SAFE_NUMBERS_HOST_DEVICE friend constexpr auto operator<=>(unsigned_integer_basis lhs, unsigned_integer_basis rhs) noexcept -> std::strong_ordering = default; ---- @@ -323,22 +341,27 @@ All comparison operators (`<`, `<=`, `>`, `>=`, `==`, `!=`) are available. [source,c++] ---- template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator+(unsigned_integer_basis lhs, unsigned_integer_basis rhs) -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator-(unsigned_integer_basis lhs, unsigned_integer_basis rhs) -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator*(unsigned_integer_basis lhs, unsigned_integer_basis rhs) -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator/(unsigned_integer_basis lhs, unsigned_integer_basis rhs) -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator%(unsigned_integer_basis lhs, unsigned_integer_basis rhs) -> unsigned_integer_basis; ---- @@ -356,18 +379,23 @@ All arithmetic operators perform runtime checks and throw exceptions when undefi [source,c++] ---- template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator+=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator-=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator*=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator/=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator%=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; ---- @@ -378,30 +406,36 @@ Compound assignment operators follow the same exception behavior as their corres [source,c++] ---- template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator~(unsigned_integer_basis lhs) noexcept -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator&(unsigned_integer_basis lhs, unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator|(unsigned_integer_basis lhs, unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator^(unsigned_integer_basis lhs, unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator<<(unsigned_integer_basis lhs, unsigned_integer_basis rhs) -> unsigned_integer_basis; template +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator>>(unsigned_integer_basis lhs, unsigned_integer_basis rhs) -> unsigned_integer_basis; @@ -473,11 +507,11 @@ All shift policy functions are `noexcept`. [source,c++] ---- -constexpr auto operator&=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&; -constexpr auto operator|=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&; -constexpr auto operator^=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&; -constexpr auto operator<<=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; -constexpr auto operator>>=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator&=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&; +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator|=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&; +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator^=(unsigned_integer_basis rhs) noexcept -> unsigned_integer_basis&; +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator<<=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator>>=(unsigned_integer_basis rhs) -> unsigned_integer_basis&; ---- Compound bitwise assignment operators delegate to the corresponding free-function bitwise operators and follow the same exception behavior. @@ -488,10 +522,10 @@ Compound bitwise assignment operators delegate to the corresponding free-functio [source,c++] ---- -constexpr auto operator++() -> unsigned_integer_basis&; -constexpr auto operator++(int) -> unsigned_integer_basis; -constexpr auto operator--() -> unsigned_integer_basis&; -constexpr auto operator--(int) -> unsigned_integer_basis; +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator++() -> unsigned_integer_basis&; +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator++(int) -> unsigned_integer_basis; +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator--() -> unsigned_integer_basis&; +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator--(int) -> unsigned_integer_basis; ---- - `++` (pre/post): Throws `std::overflow_error` if the value is already at the maximum @@ -501,8 +535,8 @@ constexpr auto operator--(int) -> unsigned_integer_basis; [source,c++] ---- -constexpr auto operator+() const noexcept -> unsigned_integer_basis; -constexpr auto operator-() const noexcept; // compile-time error +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator+() const noexcept -> unsigned_integer_basis; +BOOST_SAFE_NUMBERS_HOST_DEVICE constexpr auto operator-() const noexcept; // compile-time error ---- - `+`: Returns a copy of the value (identity). This is consistent with built-in unsigned integer behavior.