From 203bd06bef2860586bce6bb6ced4fc4218827609 Mon Sep 17 00:00:00 2001 From: Jacob Faibussowitsch Date: Wed, 1 Jul 2026 15:00:19 -0400 Subject: [PATCH 1/7] Fix CUB DeviceReduce env overloads to accept no_init_t --- cub/cub/device/device_reduce.cuh | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index 42079bcbf73..306f6d83bb2 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -216,14 +216,16 @@ private: } //! @brief Internal implementation shared by Reduce and TransformReduce env overloads - template + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ReductionOpT, + typename TransformOpT, + typename T, + typename NumItemsT, + typename EnvT, + typename AccumT = decltype(detail::reduce::template select_accum_t( + static_cast(nullptr)))> [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t __transform_reduce( InputIteratorT d_in, OutputIteratorT d_out, @@ -543,8 +545,7 @@ public: InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, T init, EnvT env = {}) { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Reduce"); - using accum_t = ::cuda::std::__accumulator_t, T>; - return __transform_reduce(d_in, d_out, num_items, reduction_op, ::cuda::std::identity{}, init, env); + return __transform_reduce(d_in, d_out, num_items, reduction_op, ::cuda::std::identity{}, init, env); } //! @rst @@ -619,9 +620,8 @@ public: { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Sum"); using OutputT = cub::detail::non_void_value_t>; - using accum_t = ::cuda::std::__accumulator_t<::cuda::std::plus<>, cub::detail::it_value_t, OutputT>; - return __transform_reduce( - d_in, d_out, num_items, ::cuda::std::plus<>{}, ::cuda::std::identity{}, OutputT{}, env); + + return __transform_reduce(d_in, d_out, num_items, ::cuda::std::plus<>{}, ::cuda::std::identity{}, OutputT{}, env); } //! @rst @@ -2211,9 +2211,7 @@ public: EnvT env = {}) { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::TransformReduce"); - using accum_t = ::cuda::std:: - __accumulator_t>, T>; - return __transform_reduce(d_in, d_out, num_items, reduction_op, transform_op, init, env); + return __transform_reduce(d_in, d_out, num_items, reduction_op, transform_op, init, env); } //! @rst From 50265981e1848596f73b1c2893a60841106dac8e Mon Sep 17 00:00:00 2001 From: Jacob Faibussowitsch Date: Thu, 2 Jul 2026 09:48:43 -0400 Subject: [PATCH 2/7] fixup! Fix CUB DeviceReduce env overloads to accept no_init_t --- cub/test/catch2_test_device_reduce_env_api.cu | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cub/test/catch2_test_device_reduce_env_api.cu b/cub/test/catch2_test_device_reduce_env_api.cu index 47eaef43c91..c32e78900a6 100644 --- a/cub/test/catch2_test_device_reduce_env_api.cu +++ b/cub/test/catch2_test_device_reduce_env_api.cu @@ -656,3 +656,15 @@ C2H_TEST("cub::DeviceReduce::Sum queries both stream and resource from composed REQUIRE(bytes_allocated > 0); REQUIRE(bytes_deallocated == bytes_allocated); } + +C2H_TEST("cub::DeviceReduce::Reduce allows no_init in env overloads", "[reduce][env]") +{ + auto input = thrust::device_vector{1, 2, 3, 4, 5}; + auto output = thrust::device_vector(1); + + auto error = cub::DeviceReduce::Reduce( + input.begin(), output.begin(), static_cast(input.size()), cuda::std::plus<>{}, cub::detail::reduce::no_init); + + REQUIRE(error == cudaSuccess); + REQUIRE(output[0] == 15); +} From 0c4f9a90322eae0a9ce433ba44ae8b77f93387fb Mon Sep 17 00:00:00 2001 From: Jacob Faibussowitsch Date: Thu, 2 Jul 2026 10:16:34 -0400 Subject: [PATCH 3/7] fixup! Fix CUB DeviceReduce env overloads to accept no_init_t --- cub/cub/device/device_reduce.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index 306f6d83bb2..f5904cb8e2e 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -160,8 +160,8 @@ private: EnvT env) { using offset_t = detail::choose_offset_t; - using accum_t = ::cuda::std:: - __accumulator_t>, T>; + using accum_t = decltype(detail::reduce::template select_accum_t( + static_cast(nullptr))); if constexpr (Determinism == ::cuda::execution::determinism::__determinism_t::__gpu_to_gpu) { From 3712e5e147d2af15a8b347c279cbf40d64aeb807 Mon Sep 17 00:00:00 2001 From: Jacob Faibussowitsch Date: Fri, 12 Jun 2026 11:51:20 -0400 Subject: [PATCH 4/7] Add multi-GPU reduce --- .../__multi_gpu/algorithm/common.h | 155 ++++++++ .../__multi_gpu/algorithm/reduce/reduce.h | 349 ++++++++++++++++++ .../cuda/experimental/__multi_gpu/concepts.h | 14 + cudax/test/multi_gpu/CMakeLists.txt | 1 + .../test/multi_gpu/algorithms/CMakeLists.txt | 11 + .../algorithms/reduce/CMakeLists.txt | 20 + .../test/multi_gpu/algorithms/reduce/basic.cu | 322 ++++++++++++++++ .../multi_gpu/communicators/nccl/basic.cu | 2 +- .../communicators/nccl/collective.cu | 18 +- .../communicators/nccl/nccl_test_helpers.cuh | 2 +- .../communicators/nccl/peer_to_peer.cu | 4 +- libcudacxx/include/cuda/__container/buffer.h | 13 +- 12 files changed, 897 insertions(+), 14 deletions(-) create mode 100644 cudax/include/cuda/experimental/__multi_gpu/algorithm/common.h create mode 100644 cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h create mode 100644 cudax/test/multi_gpu/algorithms/CMakeLists.txt create mode 100644 cudax/test/multi_gpu/algorithms/reduce/CMakeLists.txt create mode 100644 cudax/test/multi_gpu/algorithms/reduce/basic.cu diff --git a/cudax/include/cuda/experimental/__multi_gpu/algorithm/common.h b/cudax/include/cuda/experimental/__multi_gpu/algorithm/common.h new file mode 100644 index 00000000000..b85367cbd04 --- /dev/null +++ b/cudax/include/cuda/experimental/__multi_gpu/algorithm/common.h @@ -0,0 +1,155 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___MULTI_GPU_ALGORITHM_COMMON_H +#define _CUDA_EXPERIMENTAL___MULTI_GPU_ALGORITHM_COMMON_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// NOLINTBEGIN(bugprone-reserved-identifier) + +namespace cuda::experimental::__detail +{ +#define __CUDAX_MULTI_GPU_DISPATCH(__logical_device, __count, __call, __arguments) \ + do \ + { \ + const auto __cur_context = ::cuda::__ensure_current_context{(__logical_device).context()}; \ + auto __status = ::cudaError_t{}; \ + \ + THRUST_INDEX_TYPE_DISPATCH(__status, __call, __count, __arguments); \ + THRUST_NS_QUALIFIER::cuda_cub::throw_on_error(__status, /*msg=*/"performing " #__call #__arguments); \ + } while (0) + +#define __CUDAX_MULTI_GPU_DOUBLE_DISPATCH(__logical_device, __count1, __count2, __call, __arguments) \ + do \ + { \ + const auto __cur_context = ::cuda::__ensure_current_context{(__logical_device).context()}; \ + auto __status = ::cudaError_t{}; \ + \ + THRUST_DOUBLE_INDEX_TYPE_DISPATCH(__status, __call, __count1, __count2, __arguments); \ + THRUST_NS_QUALIFIER::cuda_cub::throw_on_error(__status, /*msg=*/"performing " #__call #__arguments); \ + } while (0) + +template +[[nodiscard]] _CCCL_HOST_API constexpr ::cuda::stream_ref __stream_from_env(const _Env& __env) +{ + return ::cuda::__lazy_call_or( + ::cuda::get_stream, + [] { + return ::cuda::stream_ref{::CUstream{}}; + }, + __env); +} + +template +[[nodiscard]] _CCCL_HOST_API constexpr decltype(auto) __resource_from_env(const _Env& __env, ::cuda::device_ref __device) +{ + return ::cuda::__lazy_call_or( + ::cuda::mr::get_memory_resource, + [&] { + return ::cuda::device_default_memory_pool(__device); + }, + __env); +} + +template +[[nodiscard]] _CCCL_HOST_API constexpr decltype(auto) __sanitize_buffer_env(const _Env& __env) +{ + if constexpr (::cuda::__buffer_compatible_env<_Env>) + { + return __env; + } + else + { + return ::cuda::std::execution::env<>{}; + } +} + +template +[[nodiscard]] _CCCL_HOST_API constexpr auto __make_safe_uninitialized_buffer( + ::cuda::stream_ref __stream, _Resource&& __resource, ::cuda::std::size_t __size, const _Env& __env) +{ + if constexpr (::cuda::std::is_trivially_constructible_v<_Tp>) + { + return ::cuda::make_buffer<_Tp>( + __stream, ::cuda::std::forward<_Resource>(__resource), __size, ::cuda::no_init, __sanitize_buffer_env(__env)); + } + else + { + return ::cuda::make_buffer<_Tp>( + __stream, ::cuda::std::forward<_Resource>(__resource), __size, _Tp{}, __sanitize_buffer_env(__env)); + } +} + +template +struct __in_range_out_it_properties +{ + using __input_type = ::cuda::std::ranges::range_value_t<::cuda::std::ranges::range_reference_t<_InputRangeOfRanges>>; + using __output_type = ::cuda::std::iter_value_t<::cuda::std::ranges::range_reference_t<_RangeOfOutputIt>>; + + using __env_type = ::cuda::std::ranges::range_value_t<_EnvRange>; + + using __resource_type = ::cuda::std::remove_cvref_t(), ::cuda::std::declval<::cuda::device_ref>()))>; + + using __buffer_type = ::cuda::__buffer_type_for_props<__output_type, typename __resource_type::default_queries>; +}; + +template +_CCCL_CONCEPT __range_of_sized_random_access_ranges = _CCCL_REQUIRES_EXPR((_RangeOfRanges), )( + requires(::cuda::std::ranges::forward_range<_RangeOfRanges>), + requires(::cuda::std::ranges::sized_range<_RangeOfRanges>), + requires(::cuda::std::ranges::random_access_range<::cuda::std::ranges::range_reference_t<_RangeOfRanges>>)); + +template +_CCCL_CONCEPT __range_of_output_iters = _CCCL_REQUIRES_EXPR((_RangeOfIters, _Tp), )( + requires(::cuda::std::ranges::forward_range<_RangeOfIters>), + requires( + ::cuda::std::output_iterator<::cuda::std::remove_cvref_t<::cuda::std::ranges::range_reference_t<_RangeOfIters>>, + _Tp>)); +} // namespace cuda::experimental::__detail + +// NOLINTEND(bugprone-reserved-identifier) + +#include + +#endif // _CUDA_EXPERIMENTAL___MULTI_GPU_ALGORITHM_COMMON_H diff --git a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h new file mode 100644 index 00000000000..778414b6af4 --- /dev/null +++ b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h @@ -0,0 +1,349 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___MULTI_GPU_REDUCE_H +#define _CUDA_EXPERIMENTAL___MULTI_GPU_REDUCE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include + +// NOLINTBEGIN(bugprone-reserved-identifier) + +namespace cuda::experimental +{ +namespace __detail::__reduce +{ +template +struct __partial_redop +{ + using __buffer_type = _Buffer; + using __env_type = _Env; + + _Buffer __buffer; + _Env __env; + ::cuda::stream_ref __stream; +}; + +template +[[nodiscard]] _CCCL_HOST_API __partial_redop<_Buffer, _Env> __local_reduction( + const ::cuda::std::int32_t __ROOT_RANK, + _Comm&& __comm, + _Env __env, + _InputRange&& __inputs, + const _Tp& __init, + _BinaryOp __op) +{ + const auto& __logical_device = __comm.logical_device(); + auto __stream = __stream_from_env(__env); + auto __resource = __resource_from_env(__env, __logical_device.underlying_device()); + + static_assert(::cuda::mr::resource_with, + "Provided memory resource must be device accessible"); + + const auto __num_items = ::cuda::std::ranges::size(__inputs); + // Allocate enough storage so that we can use the buffer directly in an in-place comm all + // gather/all reduce call. Those calls require that the receive buffer is of size nranks * + // sendcount. + auto __buff = __make_safe_uninitialized_buffer<_Tp>(__stream, __resource, __comm.size(), __env); + static_assert(::cuda::std::same_as); + + if (const auto __rank = __comm.rank(); __rank == __ROOT_RANK) + { + __CUDAX_MULTI_GPU_DISPATCH( + __logical_device, + __num_items, + CUB_NS_QUALIFIER::DeviceReduce::Reduce, + (::cuda::std::ranges::begin(__inputs), + // Similarly to above, prepare for the comm calls later. In order for those to be + // in-place, the sendbuff = recvbuff + rank, so we need to place our partial result + // there + __buff.begin() + __rank, + __num_items_fixed, + ::cuda::std::move(__op), + __init, + __env)); + } + else + { + __CUDAX_MULTI_GPU_DISPATCH( + __logical_device, + __num_items, + CUB_NS_QUALIFIER::DeviceReduce::Reduce, + (::cuda::std::ranges::begin(__inputs), + // Similarly to above, prepare for the com calls later. In order for those to be + // in-place, the sendbuff = recvbuff + rank, so we need to place our partial result + // there + __buff.begin() + __rank, + __num_items_fixed, + ::cuda::std::move(__op), + CUB_NS_QUALIFIER::detail::reduce::no_init, + __env)); + } + return {::cuda::std::move(__buff), ::cuda::std::move(__env), __stream}; +} + +template +void __direct_reduction(_CommRange&& __comms, + const ::std::vector<_PartialType>& __partials, + _RangeOfOutputIt&& __outputs, + const _BinaryOp& __op) +{ + auto&& __guard = ::cuda::std::ranges::begin(__comms)->group_guard(); + + for (const auto [__comm, __local, __out_it] : ::cuda::std::ranges::views::zip(__comms, __partials, __outputs)) + { + __comm.all_reduce( + __guard, + __local.__buffer.data() + __comm.rank(), + ::cuda::std::to_address(__out_it), + /*__count=*/1, + __op, + __local.__stream); + } +} + +template +void __two_stage_gather_reduction( + _CommRange&& __comms, + const ::std::vector<_PartialType>& __partials, + _RangeOfOutputIt&& __outputs, + const _BinaryOp& __op) +{ + { + auto&& __guard = ::cuda::std::ranges::begin(__comms)->group_guard(); + + for (const auto& [__comm, __local] : ::cuda::std::ranges::views::zip(__comms, __partials)) + { + auto* const __ptr = __local.__buffer.data(); + + __comm.all_gather(__guard, __ptr + __comm.rank(), __ptr, /*__count=*/1, __local.__stream); + } + } + + for (auto&& [__comm, __part, __out] : ::cuda::std::ranges::views::zip(__comms, __partials, __outputs)) + { + auto&& [__buffer, __env, _] = __part; + const auto __num_items = __buffer.size(); + + __CUDAX_MULTI_GPU_DISPATCH( + __comm.logical_device(), + __num_items, + CUB_NS_QUALIFIER::DeviceReduce::Reduce, + (__buffer.begin(), __out, __num_items_fixed, __op, CUB_NS_QUALIFIER::detail::reduce::no_init, __env)); + } +} +} // namespace __detail::__reduce + +//! @brief Reduce each input range over its communicator and write one result per output +//! iterator. +//! +//! Performs one reduction per communicator in parallel across devices. The communicators, +//! environments, input ranges and output iterators are iterated in lockstep, so for the +//! i-th element of each range the i-th input range is reduced with `__op` seeded by +//! `__init` on the i-th communicator's devices, the partial results are combined across +//! all ranks of that communicator, and the final value is written through the i-th output +//! iterator. +//! +//! This routine is used when the current thread or process owns multiple local GPUs. For +//! example, consider a scenario where there are 8 GPUs and 4 processes such that each +//! process owns 2 GPUs. Then the user would call this routine on each process, passing in both +//! local arrays: +//! @code{.cpp} +//! device_buffer gpu_0_data = ...; +//! device_buffer gpu_1_data = ...; +//! +//! cudax::reduce({comm0, comm1}, +//! {env_0, env_1}, +//! {gpu_0_data, gpu_1_data}, +//! {out_0, out_1}, +//! ...) +//! @endcode +//! +//! All ranges must have the same length. The algorithm will cap iteration to the shortest +//! length, but this should not be relied upon and may change at any time, for any reason. So +//! differing lengths is effectively undefined behavior. +//! +//! After this call returns, all local output iterators will hold the same value. In that sense +//! this routine is similar to an "all reduce". +//! +//! @tparam _CommRange The range of communicators. Each element must model the communicator +//! concept. +//! @tparam _EnvRange The range of execution environments. Each environment supplies the +//! stream and memory resource used for its communicator. +//! @tparam _InputRangeOfRanges The range whose elements are the per-communicator input +//! ranges. Each element must be a sized random-access range. +//! @tparam _RangeOfOutputIt The range of output iterators, one per communicator. +//! @tparam _Tp The reduction and result value type. Deduced by default from the output +//! element type. +//! @tparam _BinaryOp The binary reduction operator type. Defaults to `::cuda::std::plus<>`. +//! +//! @param[in] __comms The range of communicators. +//! @param[in] __envs The range of execution environments. +//! @param[in] __range_of_inputs The range of per-communicator input ranges to reduce. +//! @param[out] __outputs The range of output iterators receiving the per-communicator results. +//! @param[in] __init The initial value seeding each reduction. +//! @param[in] __op The binary reduction operator. +_CCCL_TEMPLATE(class _CommRange, + class _EnvRange, + class _InputRangeOfRanges, + class _RangeOfOutputIt, + class _Tp = ::cuda::std::iter_value_t<::cuda::std::ranges::range_reference_t<_RangeOfOutputIt>>, + class _BinaryOp = ::cuda::std::plus<>) +_CCCL_REQUIRES(__range_of_communicators<_CommRange> _CCCL_AND ::cuda::std::ranges::forward_range<_EnvRange> _CCCL_AND + __detail::__range_of_sized_random_access_ranges<_InputRangeOfRanges> _CCCL_AND + __detail::__range_of_output_iters<_RangeOfOutputIt, _Tp>) +_CCCL_HOST_API void reduce( + _CommRange&& __comms, + _EnvRange&& __envs, + _InputRangeOfRanges&& __range_of_inputs, + _RangeOfOutputIt&& __outputs, + _Tp __init = {}, + _BinaryOp __op = {}) +{ + static_assert(::cuda::std::ranges::sized_range<_CommRange>); + + using __properties = + ::cuda::experimental::__detail::__in_range_out_it_properties<_InputRangeOfRanges, _RangeOfOutputIt, _EnvRange>; + using __partial_type = ::cuda::experimental::__detail::__reduce::__partial_redop; + + const auto __num_local = ::cuda::std::ranges::size(__comms); + + if (!__num_local) + { + return; + } + + auto __partials = ::std::vector<__partial_type>{}; + + __partials.reserve(__num_local); + // TODO(jfaibussowit): can just be ranges::zip | ranges::transform | ranges::to() (and then + // we don't need to do the env, and buffer type deduction upfront) + for (auto&& [__comm, __env, __inputs] : ::cuda::std::ranges::views::zip(__comms, __envs, __range_of_inputs)) + { + __partials.emplace_back( + ::cuda::experimental::__detail::__reduce::__local_reduction( + /*__ROOT_RANK=*/0, __comm, __env, __inputs, __init, __op)); + } + + if constexpr (::cuda::experimental::__has_all_reduce<::cuda::std::ranges::range_value_t<_CommRange>, + typename __properties::__output_type*>) + { + __direct_reduction(__comms, __partials, __outputs, __op); + } + else + { + __two_stage_gather_reduction(__comms, __partials, __outputs, __op); + } +} + +//! @brief Reduce a single input range over a single communicator using the given execution +//! environment. +//! +//! Convenience wrapper that forwards a single `(communicator, environment, input range, +//! output iterator)` to the range-based overload. The input range is reduced with `__op` +//! seeded by `__init` across the communicator's ranks and the final value is written +//! through `__output`. +//! +//! @tparam _Comm The communicator type. Must model the communicator concept. +//! @tparam _Env The execution environment type. Supplies the stream and memory resource. +//! @tparam _InputRange The input range type. Must be a random-access range. +//! @tparam _OutputIt The output iterator type. +//! @tparam _Tp The reduction and result value type. Deduced by default from the output +//! value type. +//! @tparam _BinaryOp The binary reduction operator type. Defaults to `::cuda::std::plus<>`. +//! +//! @param[in] __comm The communicator. +//! @param[in] __env The execution environment. +//! @param[in] __inputs The input range to reduce. +//! @param[out] __output The output iterator receiving the result. +//! @param[in] __init The initial value seeding the reduction. +//! @param[in] __op The binary reduction operator. +_CCCL_TEMPLATE(class _Comm, + class _Env, + class _InputRange, + class _OutputIt, + class _Tp = ::cuda::std::iter_value_t<_OutputIt>, + class _BinaryOp = ::cuda::std::plus<>) +_CCCL_REQUIRES(__communicator<_Comm> _CCCL_AND ::cuda::std::ranges::random_access_range<_InputRange> + _CCCL_AND ::cuda::std::output_iterator<_OutputIt, _Tp>) +_CCCL_HOST_API void +reduce(_Comm&& __comm, _Env&& __env, _InputRange&& __inputs, _OutputIt __output, _Tp __init = {}, _BinaryOp __op = {}) +{ + reduce(::cuda::std::span<::cuda::std::remove_reference_t<_Comm>, 1>{::cuda::std::addressof(__comm), 1}, + ::cuda::std::span<::cuda::std::remove_reference_t<_Env>, 1>{::cuda::std::addressof(__env), 1}, + ::cuda::std::span<::cuda::std::remove_reference_t<_InputRange>, 1>{::cuda::std::addressof(__inputs), 1}, + ::cuda::std::span<::cuda::std::remove_reference_t<_OutputIt>, 1>{::cuda::std::addressof(__output), 1}, + ::cuda::std::move(__init), + ::cuda::std::move(__op)); +} + +//! @brief Reduce a single input range over a single communicator using +//! `::cuda::std::plus<>` and a default-constructed initial value. +//! +//! Convenience wrapper that supplies a default execution environment and a default +//! `::cuda::std::plus<>` reduction, forwarding to the environment-taking overload. The +//! result value type is the input range's value type. +//! +//! @tparam _Comm The communicator type. Must model the communicator concept. +//! @tparam _InputRange The input range type. Must be a random-access range. +//! @tparam _OutputIt The output iterator type. +//! +//! @param[in] __comm The communicator. +//! @param[in] __inputs The input range to reduce. +//! @param[out] __output The output iterator receiving the result. +_CCCL_TEMPLATE(class _Comm, class _InputRange, class _OutputIt) +_CCCL_REQUIRES(__communicator<_Comm> _CCCL_AND ::cuda::std::ranges::random_access_range<_InputRange> + _CCCL_AND ::cuda::std::output_iterator<_OutputIt, ::cuda::std::ranges::range_value_t<_InputRange>>) +_CCCL_HOST_API void reduce(_Comm&& __comm, _InputRange&& __inputs, _OutputIt __output) +{ + reduce(::cuda::std::forward<_Comm>(__comm), + ::cuda::std::execution::env<>{}, + ::cuda::std::forward<_InputRange>(__inputs), + ::cuda::std::forward<_OutputIt>(__output)); +} +} // namespace cuda::experimental + +// NOLINTEND(bugprone-reserved-identifier) + +#include + +#endif // _CUDA_EXPERIMENTAL___MULTI_GPU_REDUCE_H diff --git a/cudax/include/cuda/experimental/__multi_gpu/concepts.h b/cudax/include/cuda/experimental/__multi_gpu/concepts.h index 2d805553aee..5b3a9f20e2f 100644 --- a/cudax/include/cuda/experimental/__multi_gpu/concepts.h +++ b/cudax/include/cuda/experimental/__multi_gpu/concepts.h @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include #include @@ -214,6 +216,18 @@ _CCCL_CONCEPT __has_all_to_all_v = _CCCL_REQUIRES_EXPR( __recv_counts, __recv_displs, __stream)); + +// ========================================================================================== + +template +_CCCL_CONCEPT __range_of_communicators = _CCCL_REQUIRES_EXPR((_Range), )( + requires(::cuda::std::ranges::forward_range<_Range>), + requires(__communicator<::cuda::std::remove_cvref_t<::cuda::std::ranges::range_reference_t<_Range>>>)); + +template +_CCCL_CONCEPT __range_of_streams = _CCCL_REQUIRES_EXPR((_Range), )( + requires(::cuda::std::ranges::forward_range<_Range>), + requires(::cuda::std::ranges::__container_compatible_range<_Range, ::cuda::stream_ref>)); } // namespace cuda::experimental // NOLINTEND(bugprone-reserved-identifier) diff --git a/cudax/test/multi_gpu/CMakeLists.txt b/cudax/test/multi_gpu/CMakeLists.txt index 0bef8b2bc2e..f617ecf5e3b 100644 --- a/cudax/test/multi_gpu/CMakeLists.txt +++ b/cudax/test/multi_gpu/CMakeLists.txt @@ -63,6 +63,7 @@ function(cudax_add_multi_gpu_test sub_prefix target_name_var source) set(${target_name_var} ${test_target} PARENT_SCOPE) endfunction() +add_subdirectory(algorithms) add_subdirectory(communicators) add_subdirectory(concepts) add_subdirectory(nccl) diff --git a/cudax/test/multi_gpu/algorithms/CMakeLists.txt b/cudax/test/multi_gpu/algorithms/CMakeLists.txt new file mode 100644 index 00000000000..279f970b1af --- /dev/null +++ b/cudax/test/multi_gpu/algorithms/CMakeLists.txt @@ -0,0 +1,11 @@ +#===----------------------------------------------------------------------===## +# +# Part of CUDA Experimental in CUDA C++ Core Libraries, +# under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +# +#===----------------------------------------------------------------------===## + +add_subdirectory(reduce) diff --git a/cudax/test/multi_gpu/algorithms/reduce/CMakeLists.txt b/cudax/test/multi_gpu/algorithms/reduce/CMakeLists.txt new file mode 100644 index 00000000000..1a43a0be4f8 --- /dev/null +++ b/cudax/test/multi_gpu/algorithms/reduce/CMakeLists.txt @@ -0,0 +1,20 @@ +#===----------------------------------------------------------------------===## +# +# Part of CUDA Experimental in CUDA C++ Core Libraries, +# under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +# +#===----------------------------------------------------------------------===## + +if (NOT cudax_ENABLE_NCCL) + return() +endif() + +file(GLOB test_srcs LIST_DIRECTORIES FALSE CONFIGURE_DEPENDS *.cu *.cpp) + +foreach (src IN LISTS test_srcs) + cudax_add_multi_gpu_test("algorithms.reduce" test_target "${src}") + target_link_libraries(${test_target} PRIVATE NCCL::nccl) +endforeach() diff --git a/cudax/test/multi_gpu/algorithms/reduce/basic.cu b/cudax/test/multi_gpu/algorithms/reduce/basic.cu new file mode 100644 index 00000000000..babffa8ff50 --- /dev/null +++ b/cudax/test/multi_gpu/algorithms/reduce/basic.cu @@ -0,0 +1,322 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +#include "../../communicators/nccl/nccl_test_helpers.cuh" + +namespace +{ +using custom_value = c2h::custom_type_t; +static_assert(cudax::nccl_transportable); + +template +T make_value(int i) +{ + return static_cast(i); +} + +template <> +custom_value make_value<>(int i) +{ + return {static_cast(i), static_cast(i)}; +}; + +// A custom reduction operator equivalent to `cuda::std::plus<>`. +struct custom_plus +{ + template + _CCCL_HOST_DEVICE constexpr T operator()(const T& lhs, const T& rhs) const + { + return lhs + rhs; + } +}; + +using value_types = c2h::type_list; +using operators = c2h::type_list<::cuda::std::plus<>, ::cuda::maximum<>, custom_plus>; + +// One output iterator per local output buffer. Collected after `out` is fully built so the +// iterators do not dangle across reallocations. +template +[[nodiscard]] auto make_output_iterators(OutBuffers& out) +{ + std::vector::iterator> outputs; + + outputs.reserve(out.size()); + for (auto& buf : out) + { + outputs.push_back(buf.begin()); + } + return outputs; +} + +// Run the full reduction, wait for it to finish, and check that `reduce` left its argument ranges +// untouched. This boilerplate is identical for every test regardless of how the inputs are shaped. +template +void do_reduce(Comms& comms, Envs& envs, In& in, Outputs& outputs, const T& init, Op op, Streams& streams) +{ + using value_type = typename cuda::std::remove_cvref_t::value_type; + + // cuda::std::execution::env has no operator==, so we can only compare the sizes. + const auto envs_size = envs.size(); + + auto pool = cuda::mr::legacy_pinned_memory_resource{}; + std::vector> in_copy; + in_copy.reserve(in.size()); + for (const auto& buf : in) + { + in_copy.emplace_back(cuda::make_buffer(buf.stream(), pool, buf)); + } + + const auto outputs_copy = outputs; + + cudax::reduce(comms, envs, in, outputs, init, op); + + for (auto& stream : streams) + { + stream.sync(); + } + + REQUIRE(envs.size() == envs_size); + for (cuda::std::size_t i = 0; i < in.size(); ++i) + { + const auto actual = cuda::make_buffer(in[i].stream(), pool, in[i]); + REQUIRE_THAT(actual, Equals(in_copy[i])); + } + REQUIRE(outputs == outputs_copy); +} +} // namespace + +MGMN_TEST("reduce, one element per rank", value_types, operators) +{ + using T = c2h::get<0, TestType>; + using Op = c2h::get<1, TestType>; + + // Seed each reduction with a few hardcoded initializers. The init participates in the fold the + // same way on host and device, so any value works for every operator under test. + const T init = make_value(GENERATE(0, 1, -1, 5)); + + auto comms = this->communicators(); + auto streams = nccl_test_util::make_streams(); + + // Global rank `comms[i].rank()` contributes the single value `rank`. Each local rank also gets a + // one-element output buffer and an environment carrying its stream, so the reduction is + // stream-ordered on the correct device. `reference` mirrors the contributions of every global + // rank so we can fold them on the host exactly like `reduce` does on the device. + std::vector> in; + std::vector> out; + std::vector envs; + std::vector reference; + + in.reserve(comms.size()); + out.reserve(comms.size()); + envs.reserve(comms.size()); + for (int i = 0; i < comms.size(); ++i) + { + const auto values = {make_value(comms[i].rank())}; + in.emplace_back(cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), values)); + out.emplace_back( + cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init)); + envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}}); + } + for (int r = 0; r < comms.front().size(); ++r) + { + reference.push_back(make_value(r)); + } + + auto outputs = make_output_iterators(out); + + do_reduce(comms, envs, in, outputs, init, Op{}, streams); + + const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{}); + + for (const auto& buf : out) + { + auto pool = cuda::mr::legacy_pinned_memory_resource{}; + const auto actual = cuda::make_buffer(buf.stream(), pool, buf); + + REQUIRE(actual.size() == 1); + REQUIRE(actual.front() == expected); + } +} + +MGMN_TEST("reduce, multiple elements per rank", value_types, operators) +{ + using T = c2h::get<0, TestType>; + using Op = c2h::get<1, TestType>; + + // Seed each reduction with a few hardcoded initializers. The init participates in the fold the + // same way on host and device, so any value works for every operator under test. + const T init = make_value(GENERATE(0, 1, -1, 5)); + + auto comms = this->communicators(); + auto streams = nccl_test_util::make_streams(); + + // Global rank `comms[i].rank()` contributes `{rank, rank, rank}`. `reduce` first does a local CUB + // reduction of each rank's range, then combines the partials across ranks. Each local rank also + // gets a one-element output buffer and an environment carrying its stream. `reference` mirrors + // every global rank's three contributions for the host-side fold. + std::vector> in; + std::vector> out; + std::vector envs; + std::vector reference; + + in.reserve(comms.size()); + out.reserve(comms.size()); + envs.reserve(comms.size()); + for (int i = 0; i < comms.size(); ++i) + { + const auto v = make_value(comms[i].rank()); + const auto values = {v, v, v}; + in.emplace_back(cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), values)); + out.emplace_back( + cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init)); + envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}}); + } + for (int r = 0; r < comms.front().size(); ++r) + { + const auto v = make_value(r); + reference.insert(reference.end(), {v, v, v}); + } + + auto outputs = make_output_iterators(out); + + do_reduce(comms, envs, in, outputs, init, Op{}, streams); + + const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{}); + + for (const auto& buf : out) + { + auto pool = cuda::mr::legacy_pinned_memory_resource{}; + const auto actual = cuda::make_buffer(buf.stream(), pool, buf); + + REQUIRE(actual.size() == 1); + REQUIRE(actual.front() == expected); + } +} + +MGMN_TEST("reduce, some ranks empty", value_types, operators) +{ + using T = c2h::get<0, TestType>; + using Op = c2h::get<1, TestType>; + + const T init = make_value(GENERATE(0, 1, -1, 5)); + + auto comms = this->communicators(); + auto streams = nccl_test_util::make_streams(); + + // Even global ranks contribute two copies of `rank`; odd global ranks contribute an empty input + // range. Rank 0 (the reduction root) is always non-empty. `reduce` must treat an empty rank as + // contributing nothing, exactly like `std::accumulate` over the surviving elements. `reference` + // mirrors that for the host-side fold. + std::vector> in; + std::vector> out; + std::vector envs; + std::vector reference; + + in.reserve(comms.size()); + out.reserve(comms.size()); + envs.reserve(comms.size()); + for (int i = 0; i < comms.size(); ++i) + { + const auto rank = comms[i].rank(); + if (rank % 2 == 0) + { + const auto values = {make_value(rank), make_value(rank)}; + in.emplace_back(cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), values)); + } + else + { + in.emplace_back(cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device())); + } + out.emplace_back( + cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init)); + envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}}); + } + + for (int r = 0; r < comms.front().size(); r += 2) + { + reference.push_back(make_value(r)); + reference.push_back(make_value(r)); + } + + auto outputs = make_output_iterators(out); + + do_reduce(comms, envs, in, outputs, init, Op{}, streams); + + const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{}); + + for (const auto& buf : out) + { + auto pool = cuda::mr::legacy_pinned_memory_resource{}; + const auto actual = cuda::make_buffer(buf.stream(), pool, buf); + + REQUIRE(actual.size() == 1); + REQUIRE(actual.front() == expected); + } +} + +MGMN_TEST("reduce, all ranks empty", value_types, operators) +{ + using T = c2h::get<0, TestType>; + using Op = c2h::get<1, TestType>; + + const T init = make_value(GENERATE(0, 1, -1, 5)); + + auto comms = this->communicators(); + auto streams = nccl_test_util::make_streams(); + + // No rank contributes any element. Reducing nothing seeded by `init` is just `init`, so every + // output must equal `init` regardless of the operator. + std::vector> in; + std::vector> out; + std::vector envs; + + in.reserve(comms.size()); + out.reserve(comms.size()); + envs.reserve(comms.size()); + for (int i = 0; i < comms.size(); ++i) + { + in.emplace_back(cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device())); + out.emplace_back( + cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init)); + envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}}); + } + + auto outputs = make_output_iterators(out); + + do_reduce(comms, envs, in, outputs, init, Op{}, streams); + + // Reducing nothing seeded by `init` yields `init`, exactly like `std::accumulate` over an empty + // range. + const T expected = init; + + for (const auto& buf : out) + { + auto pool = cuda::mr::legacy_pinned_memory_resource{}; + const auto actual = cuda::make_buffer(buf.stream(), pool, buf); + + REQUIRE(actual.size() == 1); + REQUIRE(actual.front() == expected); + } +} diff --git a/cudax/test/multi_gpu/communicators/nccl/basic.cu b/cudax/test/multi_gpu/communicators/nccl/basic.cu index 1ba084b8adf..f21f209e18b 100644 --- a/cudax/test/multi_gpu/communicators/nccl/basic.cu +++ b/cudax/test/multi_gpu/communicators/nccl/basic.cu @@ -32,7 +32,7 @@ C2H_TEST("nccl_communicator_ref not constructible from NCCL_COMM_NULL", "[multi_ STATIC_REQUIRE(!::cuda::std::is_constructible_v); } -NCCL_COMM_TEST("nccl_communicator_ref basic") +MGMN_TEST("nccl_communicator_ref basic", ) { SECTION("rank and size") { diff --git a/cudax/test/multi_gpu/communicators/nccl/collective.cu b/cudax/test/multi_gpu/communicators/nccl/collective.cu index 92a1402f721..96f73edfa21 100644 --- a/cudax/test/multi_gpu/communicators/nccl/collective.cu +++ b/cudax/test/multi_gpu/communicators/nccl/collective.cu @@ -26,7 +26,7 @@ namespace constexpr cuda::std::int32_t ROOT_RANK = 0; } // namespace -NCCL_COMM_TEST("nccl_communicator_ref all_reduce sum") +MGMN_TEST("nccl_communicator_ref all_reduce sum", ) { auto streams = nccl_test_util::make_streams(); @@ -69,7 +69,7 @@ NCCL_COMM_TEST("nccl_communicator_ref all_reduce sum") } } -NCCL_COMM_TEST("nccl_communicator_ref all_reduce maximum") +MGMN_TEST("nccl_communicator_ref all_reduce maximum", ) { auto streams = nccl_test_util::make_streams(); @@ -112,7 +112,7 @@ NCCL_COMM_TEST("nccl_communicator_ref all_reduce maximum") } } -NCCL_COMM_TEST("nccl_communicator_ref reduce sum to root 0") +MGMN_TEST("nccl_communicator_ref reduce sum to root 0", ) { auto streams = nccl_test_util::make_streams(); @@ -150,7 +150,7 @@ NCCL_COMM_TEST("nccl_communicator_ref reduce sum to root 0") REQUIRE_THAT(actual, Equals(expected)); } -NCCL_COMM_TEST("nccl_communicator_ref broadcast from root 0") +MGMN_TEST("nccl_communicator_ref broadcast from root 0", ) { auto streams = nccl_test_util::make_streams(); @@ -192,7 +192,7 @@ NCCL_COMM_TEST("nccl_communicator_ref broadcast from root 0") } } -NCCL_COMM_TEST("nccl_communicator_ref all_gather") +MGMN_TEST("nccl_communicator_ref all_gather", ) { auto streams = nccl_test_util::make_streams(); @@ -241,7 +241,7 @@ NCCL_COMM_TEST("nccl_communicator_ref all_gather") } } -NCCL_COMM_TEST("nccl_communicator_ref gather_v to root 0") +MGMN_TEST("nccl_communicator_ref gather_v to root 0", ) { auto streams = nccl_test_util::make_streams(); @@ -308,7 +308,7 @@ NCCL_COMM_TEST("nccl_communicator_ref gather_v to root 0") REQUIRE_THAT(actual, Equals(expected)); } -NCCL_COMM_TEST("nccl_communicator_ref all_to_all_v") +MGMN_TEST("nccl_communicator_ref all_to_all_v", ) { auto streams = nccl_test_util::make_streams(); @@ -379,7 +379,7 @@ NCCL_COMM_TEST("nccl_communicator_ref all_to_all_v") #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 0) -NCCL_COMM_TEST("nccl_communicator_ref gather to root 0") +MGMN_TEST("nccl_communicator_ref gather to root", ) { auto streams = nccl_test_util::make_streams(); @@ -424,7 +424,7 @@ NCCL_COMM_TEST("nccl_communicator_ref gather to root 0") REQUIRE_THAT(actual, Equals(expected)); } -NCCL_COMM_TEST("nccl_communicator_ref all_to_all") +MGMN_TEST("nccl_communicator_ref all_to_all", ) { auto streams = nccl_test_util::make_streams(); diff --git a/cudax/test/multi_gpu/communicators/nccl/nccl_test_helpers.cuh b/cudax/test/multi_gpu/communicators/nccl/nccl_test_helpers.cuh index fc5d1298b30..456fcf6120a 100644 --- a/cudax/test/multi_gpu/communicators/nccl/nccl_test_helpers.cuh +++ b/cudax/test/multi_gpu/communicators/nccl/nccl_test_helpers.cuh @@ -99,7 +99,7 @@ private: std::vector wrappers_{}; }; -#define NCCL_COMM_TEST(NAME, ...) \ +#define MGMN_TEST(NAME, ...) \ C2H_TEST_WITH_FIXTURE(::nccl_test_util::nccl_comm_fixture, NAME, "[multi_gpu][nccl]", __VA_ARGS__) } // namespace nccl_test_util diff --git a/cudax/test/multi_gpu/communicators/nccl/peer_to_peer.cu b/cudax/test/multi_gpu/communicators/nccl/peer_to_peer.cu index 0f6f2202ed3..c96989c60f4 100644 --- a/cudax/test/multi_gpu/communicators/nccl/peer_to_peer.cu +++ b/cudax/test/multi_gpu/communicators/nccl/peer_to_peer.cu @@ -28,7 +28,7 @@ struct payload } // namespace // Ring exchange via send/recv. Rank r contributes {r, r, r}. -NCCL_COMM_TEST("nccl_communicator_ref send/recv ring") +MGMN_TEST("nccl_communicator_ref send/recv ring", ) { if (cuda::devices.size() == 1) { @@ -84,7 +84,7 @@ NCCL_COMM_TEST("nccl_communicator_ref send/recv ring") } } -NCCL_COMM_TEST("nccl_communicator_ref send/recv transports trivially copyable payload") +MGMN_TEST("nccl_communicator_ref send/recv transports trivially copyable payload", ) { if (cuda::devices.size() == 1) { diff --git a/libcudacxx/include/cuda/__container/buffer.h b/libcudacxx/include/cuda/__container/buffer.h index e77f094a7de..059d8f1b44a 100644 --- a/libcudacxx/include/cuda/__container/buffer.h +++ b/libcudacxx/include/cuda/__container/buffer.h @@ -51,6 +51,7 @@ # include # include # include +# include # include # include # include @@ -62,9 +63,19 @@ //! @brief The \c buffer class provides a container of contiguous memory _CCCL_BEGIN_NAMESPACE_CUDA +template +inline constexpr bool __is_any_env = false; + +template +inline constexpr bool __is_any_env<::cuda::std::execution::env<_Tp...>> = true; + +// TODO: +// +// Find a way to enable checks again without bricking for environments that clearly (which we +// can somehow detect) dont intend to have alignment. template inline constexpr bool __buffer_compatible_env = - ::cuda::std::is_same_v<::cuda::std::decay_t<_Env>, ::cuda::std::execution::env<>> + __is_any_env<::cuda::std::remove_cvref_t<_Env>> || ::cuda::std::execution::__queryable_with; _CCCL_BEGIN_NAMESPACE_ABI_VER4_BUMP From b1ece662a8ba0f1bf18bc59423429edbf222a9fb Mon Sep 17 00:00:00 2001 From: Jacob Faibussowitsch Date: Wed, 1 Jul 2026 14:17:44 -0400 Subject: [PATCH 5/7] fixup! Add multi-GPU reduce --- .../__multi_gpu/algorithm/reduce/reduce.h | 15 +- .../algorithms/reduce/range_defaults.cu | 140 ++++++++++++++++++ 2 files changed, 151 insertions(+), 4 deletions(-) create mode 100644 cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu diff --git a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h index 778414b6af4..926551546b3 100644 --- a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h +++ b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h @@ -330,15 +330,22 @@ reduce(_Comm&& __comm, _Env&& __env, _InputRange&& __inputs, _OutputIt __output, //! @param[in] __comm The communicator. //! @param[in] __inputs The input range to reduce. //! @param[out] __output The output iterator receiving the result. -_CCCL_TEMPLATE(class _Comm, class _InputRange, class _OutputIt) +_CCCL_TEMPLATE(class _Comm, + class _InputRange, + class _OutputIt, + class _Tp = ::cuda::std::iter_value_t<_OutputIt>, + class _BinaryOp = ::cuda::std::plus<>) _CCCL_REQUIRES(__communicator<_Comm> _CCCL_AND ::cuda::std::ranges::random_access_range<_InputRange> - _CCCL_AND ::cuda::std::output_iterator<_OutputIt, ::cuda::std::ranges::range_value_t<_InputRange>>) -_CCCL_HOST_API void reduce(_Comm&& __comm, _InputRange&& __inputs, _OutputIt __output) + _CCCL_AND ::cuda::std::output_iterator<_OutputIt, _Tp>) +_CCCL_HOST_API void +reduce(_Comm&& __comm, _InputRange&& __inputs, _OutputIt __output, _Tp __init = {}, _BinaryOp __op = {}) { reduce(::cuda::std::forward<_Comm>(__comm), ::cuda::std::execution::env<>{}, ::cuda::std::forward<_InputRange>(__inputs), - ::cuda::std::forward<_OutputIt>(__output)); + ::cuda::std::forward<_OutputIt>(__output), + ::cuda::std::move(__init), + ::cuda::std::move(__op)); } } // namespace cuda::experimental diff --git a/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu b/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu new file mode 100644 index 00000000000..e37ffffbaf2 --- /dev/null +++ b/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu @@ -0,0 +1,140 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +#include "../../communicators/nccl/nccl_test_helpers.cuh" + +namespace +{ +using value_type = cuda::std::int32_t; + +template +[[nodiscard]] auto make_output_iterators(OutBuffers& out) +{ + std::vector::iterator> outputs; + + outputs.reserve(out.size()); + for (auto& buf : out) + { + outputs.push_back(buf.begin()); + } + return outputs; +} +} // namespace + +MGMN_TEST("reduce, range overload with default initializer and default plus", ) +{ + auto comms = this->communicators(); + auto streams = nccl_test_util::make_streams(); + + std::vector> in; + std::vector> out; + std::vector envs; + std::vector reference; + + in.reserve(comms.size()); + out.reserve(comms.size()); + envs.reserve(comms.size()); + for (int i = 0; i < comms.size(); ++i) + { + const auto values = {static_cast(comms[i].rank())}; + in.emplace_back( + cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), values)); + out.emplace_back(cuda::make_device_buffer( + streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init)); + envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}}); + } + for (int r = 0; r < comms.front().size(); ++r) + { + reference.push_back(static_cast(r)); + } + + auto outputs = make_output_iterators(out); + + cudax::reduce(comms, envs, in, outputs); + + for (auto& stream : streams) + { + stream.sync(); + } + + const value_type expected = std::accumulate(reference.begin(), reference.end(), value_type{}); + + for (const auto& buf : out) + { + auto pool = cuda::mr::legacy_pinned_memory_resource{}; + const auto actual = cuda::make_buffer(buf.stream(), pool, buf); + + REQUIRE(actual.size() == 1); + REQUIRE(actual.front() == expected); + } +} + +MGMN_TEST("reduce, range overload with explicit initializer and default plus", ) +{ + const value_type init{10}; + + auto comms = this->communicators(); + auto streams = nccl_test_util::make_streams(); + + std::vector> in; + std::vector> out; + std::vector envs; + std::vector reference; + + in.reserve(comms.size()); + out.reserve(comms.size()); + envs.reserve(comms.size()); + for (int i = 0; i < comms.size(); ++i) + { + const auto values = {static_cast(comms[i].rank())}; + in.emplace_back( + cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), values)); + out.emplace_back(cuda::make_device_buffer( + streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init)); + envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}}); + } + for (int r = 0; r < comms.front().size(); ++r) + { + reference.push_back(static_cast(r)); + } + + auto outputs = make_output_iterators(out); + + cudax::reduce(comms, envs, in, outputs, init); + + for (auto& stream : streams) + { + stream.sync(); + } + + const value_type expected = std::accumulate(reference.begin(), reference.end(), init); + + for (const auto& buf : out) + { + auto pool = cuda::mr::legacy_pinned_memory_resource{}; + const auto actual = cuda::make_buffer(buf.stream(), pool, buf); + + REQUIRE(actual.size() == 1); + REQUIRE(actual.front() == expected); + } +} From f88f027ed1dcf1947e446a75ba65feaf3d52c6be Mon Sep 17 00:00:00 2001 From: Jacob Faibussowitsch Date: Thu, 2 Jul 2026 10:12:03 -0400 Subject: [PATCH 6/7] fixup! Add multi-GPU reduce --- .../__multi_gpu/algorithm/reduce/reduce.h | 24 ++++++++----------- .../cuda/experimental/__multi_gpu/concepts.h | 12 +++++----- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h index 926551546b3..78d49ef4259 100644 --- a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h +++ b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h @@ -121,14 +121,12 @@ template -void __direct_reduction(_CommRange&& __comms, - const ::std::vector<_PartialType>& __partials, - _RangeOfOutputIt&& __outputs, - const _BinaryOp& __op) +void __direct_reduction( + _CommRange&& __comms, _RangeOfOutputIt&& __outputs, const _BinaryOp& __op, ::std::vector<_PartialType>* __partials) { auto&& __guard = ::cuda::std::ranges::begin(__comms)->group_guard(); - for (const auto [__comm, __local, __out_it] : ::cuda::std::ranges::views::zip(__comms, __partials, __outputs)) + for (auto&& [__comm, __local, __out_it] : ::cuda::std::ranges::views::zip(__comms, *__partials, __outputs)) { __comm.all_reduce( __guard, @@ -142,15 +140,12 @@ void __direct_reduction(_CommRange&& __comms, template void __two_stage_gather_reduction( - _CommRange&& __comms, - const ::std::vector<_PartialType>& __partials, - _RangeOfOutputIt&& __outputs, - const _BinaryOp& __op) + _CommRange&& __comms, _RangeOfOutputIt&& __outputs, const _BinaryOp& __op, ::std::vector<_PartialType>* __partials) { { auto&& __guard = ::cuda::std::ranges::begin(__comms)->group_guard(); - for (const auto& [__comm, __local] : ::cuda::std::ranges::views::zip(__comms, __partials)) + for (auto&& [__comm, __local] : ::cuda::std::ranges::views::zip(__comms, *__partials)) { auto* const __ptr = __local.__buffer.data(); @@ -158,7 +153,7 @@ void __two_stage_gather_reduction( } } - for (auto&& [__comm, __part, __out] : ::cuda::std::ranges::views::zip(__comms, __partials, __outputs)) + for (auto&& [__comm, __part, __out] : ::cuda::std::ranges::views::zip(__comms, *__partials, __outputs)) { auto&& [__buffer, __env, _] = __part; const auto __num_items = __buffer.size(); @@ -265,13 +260,14 @@ _CCCL_HOST_API void reduce( } if constexpr (::cuda::experimental::__has_all_reduce<::cuda::std::ranges::range_value_t<_CommRange>, - typename __properties::__output_type*>) + typename __properties::__output_type*, + _BinaryOp>) { - __direct_reduction(__comms, __partials, __outputs, __op); + __direct_reduction(__comms, __outputs, __op, &__partials); } else { - __two_stage_gather_reduction(__comms, __partials, __outputs, __op); + __two_stage_gather_reduction(__comms, __outputs, __op, &__partials); } } diff --git a/cudax/include/cuda/experimental/__multi_gpu/concepts.h b/cudax/include/cuda/experimental/__multi_gpu/concepts.h index 5b3a9f20e2f..6c94acf7ec2 100644 --- a/cudax/include/cuda/experimental/__multi_gpu/concepts.h +++ b/cudax/include/cuda/experimental/__multi_gpu/concepts.h @@ -80,14 +80,14 @@ _CCCL_CONCEPT __communicator = _CCCL_REQUIRES_EXPR((_Comm), _Comm& __comm)( // Use a typed pointer as default here, since the op may need to instantiated with a // dereferenceable pointer type for reductions -template +template > _CCCL_CONCEPT __has_reduce = _CCCL_REQUIRES_EXPR( - (_Comm, _Ptr), + (_Comm, _Ptr, _Op), _Comm& __comm, _Ptr __sendbuff, _Ptr __recvbuff, ::cuda::std::size_t __count, - ::cuda::std::plus<> __op, + _Op __op, ::cuda::std::int32_t __root, ::cuda::stream_ref __stream)( requires(__communicator<_Comm>), @@ -98,14 +98,14 @@ _CCCL_CONCEPT __has_reduce = _CCCL_REQUIRES_EXPR( // Use a typed pointer as default here, since the op may need to instantiated with a // dereferenceable pointer type for reductions -template +template > _CCCL_CONCEPT __has_all_reduce = _CCCL_REQUIRES_EXPR( - (_Comm, _Ptr), + (_Comm, _Ptr, _Op), _Comm& __comm, _Ptr __sendbuff, _Ptr __recvbuff, ::cuda::std::size_t __count, - ::cuda::std::plus<> __op, + _Op __op, ::cuda::stream_ref __stream)( requires(__communicator<_Comm>), _Same_as(void) __comm.all_reduce( From b9b17664a74f6737ac9a11dbed6fbce3d1f43662 Mon Sep 17 00:00:00 2001 From: Jacob Faibussowitsch Date: Thu, 2 Jul 2026 14:43:02 -0400 Subject: [PATCH 7/7] fixup! Add multi-GPU reduce --- c2h/include/c2h/catch2_test_helper.h | 29 +++++- c2h/include/c2h/vector.h | 2 + .../__multi_gpu/algorithm/reduce/reduce.h | 54 +++++++--- .../test/multi_gpu/algorithms/reduce/basic.cu | 98 +++++++++++++------ .../algorithms/reduce/range_defaults.cu | 16 ++- 5 files changed, 137 insertions(+), 62 deletions(-) diff --git a/c2h/include/c2h/catch2_test_helper.h b/c2h/include/c2h/catch2_test_helper.h index 923fe077319..9c60d84ff6a 100644 --- a/c2h/include/c2h/catch2_test_helper.h +++ b/c2h/include/c2h/catch2_test_helper.h @@ -5,6 +5,7 @@ #include +#include #include #include #include @@ -360,8 +361,8 @@ struct vector_compare_result_t std::optional>> last_mismatches; }; -template -auto compare_vectors(const host_vector& actual, const host_vector& expected) -> vector_compare_result_t +template +auto compare_host_ranges(const LhsRange& actual, const RhsRange& expected) -> vector_compare_result_t { constexpr size_t good_values_before_mismatch = 3; constexpr size_t first_mismatches_count = 5; @@ -411,6 +412,12 @@ auto compare_vectors(const host_vector& actual, const host_vector& expecte return result; } +template +auto compare_vectors(const host_vector& actual, const host_vector& expected) -> vector_compare_result_t +{ + return compare_host_ranges(actual, expected); +} + template auto compare_vectors(const device_vector& actual, const device_vector& expected) -> vector_compare_result_t { @@ -421,10 +428,12 @@ template auto compare_vectors(const cuda::buffer& actual, const cuda::buffer& expected) -> vector_compare_result_t { + const auto actual_host = cuda::make_buffer(actual.stream(), cuda::mr::legacy_pinned_memory_resource{}, actual); + const auto expected_host = cuda::make_buffer(expected.stream(), cuda::mr::legacy_pinned_memory_resource{}, expected); + actual.stream().sync(); expected.stream().sync(); - return compare_vectors(host_vector(actual.begin(), actual.end()), - host_vector(expected.begin(), expected.end())); + return compare_host_ranges(actual_host, expected_host); } template @@ -559,6 +568,18 @@ ::std::ostream& operator<<(::std::ostream& os, const tuple& tup) } _CCCL_END_NAMESPACE_CUDA_STD +_CCCL_BEGIN_NAMESPACE_CUDA +template +::std::ostream& operator<<(::std::ostream& os, const cuda::buffer& buffer) +{ + const auto host_buf = cuda::make_buffer(buffer.stream(), cuda::mr::legacy_pinned_memory_resource{}, buffer); + + buffer.stream().sync(); + os << ::Catch::Detail::stringify(::std::vector{host_buf.begin(), host_buf.end()}); + return os; +} +_CCCL_END_NAMESPACE_CUDA + template <> struct Catch::StringMaker { diff --git a/c2h/include/c2h/vector.h b/c2h/include/c2h/vector.h index d26cfa6a206..0329a2ddaca 100644 --- a/c2h/include/c2h/vector.h +++ b/c2h/include/c2h/vector.h @@ -5,6 +5,8 @@ #include +#include +#include #include #include diff --git a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h index 78d49ef4259..6650508a1bc 100644 --- a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h +++ b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -69,16 +70,13 @@ template , - "Provided memory resource must be device accessible"); - - const auto __num_items = ::cuda::std::ranges::size(__inputs); + const auto __num_items = ::cuda::std::ranges::size(__inputs); // Allocate enough storage so that we can use the buffer directly in an in-place comm all // gather/all reduce call. Those calls require that the receive buffer is of size nranks * // sendcount. @@ -114,9 +112,10 @@ template ()) { static_assert(::cuda::std::ranges::sized_range<_CommRange>); @@ -256,18 +261,18 @@ _CCCL_HOST_API void reduce( { __partials.emplace_back( ::cuda::experimental::__detail::__reduce::__local_reduction( - /*__ROOT_RANK=*/0, __comm, __env, __inputs, __init, __op)); + /*__ROOT_RANK=*/0, __comm, __env, __inputs, __init, __ident, __op)); } if constexpr (::cuda::experimental::__has_all_reduce<::cuda::std::ranges::range_value_t<_CommRange>, typename __properties::__output_type*, _BinaryOp>) { - __direct_reduction(__comms, __outputs, __op, &__partials); + ::cuda::experimental::__detail::__reduce::__direct_reduction(__comms, __outputs, __op, &__partials); } else { - __two_stage_gather_reduction(__comms, __outputs, __op, &__partials); + ::cuda::experimental::__detail::__reduce::__two_stage_gather_reduction(__comms, __outputs, __op, &__partials); } } @@ -293,6 +298,7 @@ _CCCL_HOST_API void reduce( //! @param[out] __output The output iterator receiving the result. //! @param[in] __init The initial value seeding the reduction. //! @param[in] __op The binary reduction operator. +//! @param[in] __ident The identity element to be used in case of empty ranges. _CCCL_TEMPLATE(class _Comm, class _Env, class _InputRange, @@ -301,15 +307,22 @@ _CCCL_TEMPLATE(class _Comm, class _BinaryOp = ::cuda::std::plus<>) _CCCL_REQUIRES(__communicator<_Comm> _CCCL_AND ::cuda::std::ranges::random_access_range<_InputRange> _CCCL_AND ::cuda::std::output_iterator<_OutputIt, _Tp>) -_CCCL_HOST_API void -reduce(_Comm&& __comm, _Env&& __env, _InputRange&& __inputs, _OutputIt __output, _Tp __init = {}, _BinaryOp __op = {}) +_CCCL_HOST_API void reduce( + _Comm&& __comm, + _Env&& __env, + _InputRange&& __inputs, + _OutputIt __output, + _Tp __init = {}, + _BinaryOp __op = {}, + _Tp __ident = ::cuda::identity_element<_BinaryOp, _Tp>()) { reduce(::cuda::std::span<::cuda::std::remove_reference_t<_Comm>, 1>{::cuda::std::addressof(__comm), 1}, ::cuda::std::span<::cuda::std::remove_reference_t<_Env>, 1>{::cuda::std::addressof(__env), 1}, ::cuda::std::span<::cuda::std::remove_reference_t<_InputRange>, 1>{::cuda::std::addressof(__inputs), 1}, ::cuda::std::span<::cuda::std::remove_reference_t<_OutputIt>, 1>{::cuda::std::addressof(__output), 1}, ::cuda::std::move(__init), - ::cuda::std::move(__op)); + ::cuda::std::move(__op), + ::cuda::std::move(__ident)); } //! @brief Reduce a single input range over a single communicator using @@ -326,6 +339,9 @@ reduce(_Comm&& __comm, _Env&& __env, _InputRange&& __inputs, _OutputIt __output, //! @param[in] __comm The communicator. //! @param[in] __inputs The input range to reduce. //! @param[out] __output The output iterator receiving the result. +//! @param[in] __init The initial value seeding the reduction. +//! @param[in] __op The binary reduction operator. +//! @param[in] __ident The identity element to be used in case of empty ranges. _CCCL_TEMPLATE(class _Comm, class _InputRange, class _OutputIt, @@ -333,15 +349,21 @@ _CCCL_TEMPLATE(class _Comm, class _BinaryOp = ::cuda::std::plus<>) _CCCL_REQUIRES(__communicator<_Comm> _CCCL_AND ::cuda::std::ranges::random_access_range<_InputRange> _CCCL_AND ::cuda::std::output_iterator<_OutputIt, _Tp>) -_CCCL_HOST_API void -reduce(_Comm&& __comm, _InputRange&& __inputs, _OutputIt __output, _Tp __init = {}, _BinaryOp __op = {}) +_CCCL_HOST_API void reduce( + _Comm&& __comm, + _InputRange&& __inputs, + _OutputIt __output, + _Tp __init = {}, + _BinaryOp __op = {}, + _Tp __ident = ::cuda::identity_element<_BinaryOp, _Tp>()) { reduce(::cuda::std::forward<_Comm>(__comm), ::cuda::std::execution::env<>{}, ::cuda::std::forward<_InputRange>(__inputs), ::cuda::std::forward<_OutputIt>(__output), ::cuda::std::move(__init), - ::cuda::std::move(__op)); + ::cuda::std::move(__op), + ::cuda::std::move(__ident)); } } // namespace cuda::experimental diff --git a/cudax/test/multi_gpu/algorithms/reduce/basic.cu b/cudax/test/multi_gpu/algorithms/reduce/basic.cu index babffa8ff50..0c5f998bc14 100644 --- a/cudax/test/multi_gpu/algorithms/reduce/basic.cu +++ b/cudax/test/multi_gpu/algorithms/reduce/basic.cu @@ -24,6 +24,7 @@ #include #include "../../communicators/nccl/nccl_test_helpers.cuh" +#include namespace { @@ -39,7 +40,7 @@ T make_value(int i) template <> custom_value make_value<>(int i) { - return {static_cast(i), static_cast(i)}; + return {{static_cast(i), static_cast(i)}}; }; // A custom reduction operator equivalent to `cuda::std::plus<>`. @@ -52,6 +53,42 @@ struct custom_plus } }; +template +struct identity_value; // Unsupported combinations fail to compile. + +template +struct identity_value> +{ + [[nodiscard]] static T get() + { + return make_value(0); + } +}; + +template +struct identity_value +{ + [[nodiscard]] static T get() + { + return make_value(0); + } +}; + +template +struct identity_value> +{ + [[nodiscard]] static T get() + { + return cuda::std::numeric_limits::lowest(); + } +}; + +template +[[nodiscard]] T get_identity() +{ + return identity_value::get(); +} + using value_types = c2h::type_list; using operators = c2h::type_list<::cuda::std::plus<>, ::cuda::maximum<>, custom_plus>; @@ -73,7 +110,8 @@ template // Run the full reduction, wait for it to finish, and check that `reduce` left its argument ranges // untouched. This boilerplate is identical for every test regardless of how the inputs are shaped. template -void do_reduce(Comms& comms, Envs& envs, In& in, Outputs& outputs, const T& init, Op op, Streams& streams) +void do_reduce( + Comms& comms, Envs& envs, In& in, Outputs& outputs, const T& init, const T& ident, Op op, Streams& streams) { using value_type = typename cuda::std::remove_cvref_t::value_type; @@ -90,7 +128,7 @@ void do_reduce(Comms& comms, Envs& envs, In& in, Outputs& outputs, const T& init const auto outputs_copy = outputs; - cudax::reduce(comms, envs, in, outputs, init, op); + cudax::reduce(comms, envs, in, outputs, init, op, ident); for (auto& stream : streams) { @@ -103,7 +141,7 @@ void do_reduce(Comms& comms, Envs& envs, In& in, Outputs& outputs, const T& init const auto actual = cuda::make_buffer(in[i].stream(), pool, in[i]); REQUIRE_THAT(actual, Equals(in_copy[i])); } - REQUIRE(outputs == outputs_copy); + REQUIRE_THAT(outputs, Catch::Matchers::Equals(outputs_copy)); } } // namespace @@ -114,7 +152,8 @@ MGMN_TEST("reduce, one element per rank", value_types, operators) // Seed each reduction with a few hardcoded initializers. The init participates in the fold the // same way on host and device, so any value works for every operator under test. - const T init = make_value(GENERATE(0, 1, -1, 5)); + const T init = make_value(GENERATE(0, 1, -1, 5)); + const auto ident = get_identity(); auto comms = this->communicators(); auto streams = nccl_test_util::make_streams(); @@ -131,7 +170,7 @@ MGMN_TEST("reduce, one element per rank", value_types, operators) in.reserve(comms.size()); out.reserve(comms.size()); envs.reserve(comms.size()); - for (int i = 0; i < comms.size(); ++i) + for (cuda::std::size_t i = 0; i < comms.size(); ++i) { const auto values = {make_value(comms[i].rank())}; in.emplace_back(cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), values)); @@ -146,17 +185,15 @@ MGMN_TEST("reduce, one element per rank", value_types, operators) auto outputs = make_output_iterators(out); - do_reduce(comms, envs, in, outputs, init, Op{}, streams); + do_reduce(comms, envs, in, outputs, init, ident, Op{}, streams); const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{}); for (const auto& buf : out) { - auto pool = cuda::mr::legacy_pinned_memory_resource{}; - const auto actual = cuda::make_buffer(buf.stream(), pool, buf); + const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected); - REQUIRE(actual.size() == 1); - REQUIRE(actual.front() == expected); + REQUIRE_THAT(buf, Equals(exp)); } } @@ -167,7 +204,8 @@ MGMN_TEST("reduce, multiple elements per rank", value_types, operators) // Seed each reduction with a few hardcoded initializers. The init participates in the fold the // same way on host and device, so any value works for every operator under test. - const T init = make_value(GENERATE(0, 1, -1, 5)); + const T init = make_value(GENERATE(0, 1, -1, 5)); + const auto ident = get_identity(); auto comms = this->communicators(); auto streams = nccl_test_util::make_streams(); @@ -184,7 +222,7 @@ MGMN_TEST("reduce, multiple elements per rank", value_types, operators) in.reserve(comms.size()); out.reserve(comms.size()); envs.reserve(comms.size()); - for (int i = 0; i < comms.size(); ++i) + for (cuda::std::size_t i = 0; i < comms.size(); ++i) { const auto v = make_value(comms[i].rank()); const auto values = {v, v, v}; @@ -201,17 +239,15 @@ MGMN_TEST("reduce, multiple elements per rank", value_types, operators) auto outputs = make_output_iterators(out); - do_reduce(comms, envs, in, outputs, init, Op{}, streams); + do_reduce(comms, envs, in, outputs, init, ident, Op{}, streams); const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{}); for (const auto& buf : out) { - auto pool = cuda::mr::legacy_pinned_memory_resource{}; - const auto actual = cuda::make_buffer(buf.stream(), pool, buf); + const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected); - REQUIRE(actual.size() == 1); - REQUIRE(actual.front() == expected); + REQUIRE_THAT(buf, Equals(exp)); } } @@ -220,7 +256,8 @@ MGMN_TEST("reduce, some ranks empty", value_types, operators) using T = c2h::get<0, TestType>; using Op = c2h::get<1, TestType>; - const T init = make_value(GENERATE(0, 1, -1, 5)); + const T init = make_value(GENERATE(0, 1, -1, 5)); + const auto ident = get_identity(); auto comms = this->communicators(); auto streams = nccl_test_util::make_streams(); @@ -237,7 +274,7 @@ MGMN_TEST("reduce, some ranks empty", value_types, operators) in.reserve(comms.size()); out.reserve(comms.size()); envs.reserve(comms.size()); - for (int i = 0; i < comms.size(); ++i) + for (cuda::std::size_t i = 0; i < comms.size(); ++i) { const auto rank = comms[i].rank(); if (rank % 2 == 0) @@ -262,17 +299,15 @@ MGMN_TEST("reduce, some ranks empty", value_types, operators) auto outputs = make_output_iterators(out); - do_reduce(comms, envs, in, outputs, init, Op{}, streams); + do_reduce(comms, envs, in, outputs, init, ident, Op{}, streams); const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{}); for (const auto& buf : out) { - auto pool = cuda::mr::legacy_pinned_memory_resource{}; - const auto actual = cuda::make_buffer(buf.stream(), pool, buf); + const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected); - REQUIRE(actual.size() == 1); - REQUIRE(actual.front() == expected); + REQUIRE_THAT(buf, Equals(exp)); } } @@ -281,7 +316,8 @@ MGMN_TEST("reduce, all ranks empty", value_types, operators) using T = c2h::get<0, TestType>; using Op = c2h::get<1, TestType>; - const T init = make_value(GENERATE(0, 1, -1, 5)); + const T init = make_value(GENERATE(0, 1, -1, 5)); + const auto ident = get_identity(); auto comms = this->communicators(); auto streams = nccl_test_util::make_streams(); @@ -295,7 +331,7 @@ MGMN_TEST("reduce, all ranks empty", value_types, operators) in.reserve(comms.size()); out.reserve(comms.size()); envs.reserve(comms.size()); - for (int i = 0; i < comms.size(); ++i) + for (cuda::std::size_t i = 0; i < comms.size(); ++i) { in.emplace_back(cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device())); out.emplace_back( @@ -305,7 +341,7 @@ MGMN_TEST("reduce, all ranks empty", value_types, operators) auto outputs = make_output_iterators(out); - do_reduce(comms, envs, in, outputs, init, Op{}, streams); + do_reduce(comms, envs, in, outputs, init, ident, Op{}, streams); // Reducing nothing seeded by `init` yields `init`, exactly like `std::accumulate` over an empty // range. @@ -313,10 +349,8 @@ MGMN_TEST("reduce, all ranks empty", value_types, operators) for (const auto& buf : out) { - auto pool = cuda::mr::legacy_pinned_memory_resource{}; - const auto actual = cuda::make_buffer(buf.stream(), pool, buf); + const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected); - REQUIRE(actual.size() == 1); - REQUIRE(actual.front() == expected); + REQUIRE_THAT(buf, Equals(exp)); } } diff --git a/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu b/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu index e37ffffbaf2..1c8dbca81dd 100644 --- a/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu +++ b/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu @@ -54,7 +54,7 @@ MGMN_TEST("reduce, range overload with default initializer and default plus", ) in.reserve(comms.size()); out.reserve(comms.size()); envs.reserve(comms.size()); - for (int i = 0; i < comms.size(); ++i) + for (cuda::std::size_t i = 0; i < comms.size(); ++i) { const auto values = {static_cast(comms[i].rank())}; in.emplace_back( @@ -81,11 +81,9 @@ MGMN_TEST("reduce, range overload with default initializer and default plus", ) for (const auto& buf : out) { - auto pool = cuda::mr::legacy_pinned_memory_resource{}; - const auto actual = cuda::make_buffer(buf.stream(), pool, buf); + const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected); - REQUIRE(actual.size() == 1); - REQUIRE(actual.front() == expected); + REQUIRE_THAT(buf, Equals(exp)); } } @@ -104,7 +102,7 @@ MGMN_TEST("reduce, range overload with explicit initializer and default plus", ) in.reserve(comms.size()); out.reserve(comms.size()); envs.reserve(comms.size()); - for (int i = 0; i < comms.size(); ++i) + for (cuda::std::size_t i = 0; i < comms.size(); ++i) { const auto values = {static_cast(comms[i].rank())}; in.emplace_back( @@ -131,10 +129,8 @@ MGMN_TEST("reduce, range overload with explicit initializer and default plus", ) for (const auto& buf : out) { - auto pool = cuda::mr::legacy_pinned_memory_resource{}; - const auto actual = cuda::make_buffer(buf.stream(), pool, buf); + const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected); - REQUIRE(actual.size() == 1); - REQUIRE(actual.front() == expected); + REQUIRE_THAT(buf, Equals(exp)); } }