diff --git a/c2h/include/c2h/catch2_test_helper.h b/c2h/include/c2h/catch2_test_helper.h index 923fe077319..9c60d84ff6a 100644 --- a/c2h/include/c2h/catch2_test_helper.h +++ b/c2h/include/c2h/catch2_test_helper.h @@ -5,6 +5,7 @@ #include +#include #include #include #include @@ -360,8 +361,8 @@ struct vector_compare_result_t std::optional>> last_mismatches; }; -template -auto compare_vectors(const host_vector& actual, const host_vector& expected) -> vector_compare_result_t +template +auto compare_host_ranges(const LhsRange& actual, const RhsRange& expected) -> vector_compare_result_t { constexpr size_t good_values_before_mismatch = 3; constexpr size_t first_mismatches_count = 5; @@ -411,6 +412,12 @@ auto compare_vectors(const host_vector& actual, const host_vector& expecte return result; } +template +auto compare_vectors(const host_vector& actual, const host_vector& expected) -> vector_compare_result_t +{ + return compare_host_ranges(actual, expected); +} + template auto compare_vectors(const device_vector& actual, const device_vector& expected) -> vector_compare_result_t { @@ -421,10 +428,12 @@ template auto compare_vectors(const cuda::buffer& actual, const cuda::buffer& expected) -> vector_compare_result_t { + const auto actual_host = cuda::make_buffer(actual.stream(), cuda::mr::legacy_pinned_memory_resource{}, actual); + const auto expected_host = cuda::make_buffer(expected.stream(), cuda::mr::legacy_pinned_memory_resource{}, expected); + actual.stream().sync(); expected.stream().sync(); - return compare_vectors(host_vector(actual.begin(), actual.end()), - host_vector(expected.begin(), expected.end())); + return compare_host_ranges(actual_host, expected_host); } template @@ -559,6 +568,18 @@ ::std::ostream& operator<<(::std::ostream& os, const tuple& tup) } _CCCL_END_NAMESPACE_CUDA_STD +_CCCL_BEGIN_NAMESPACE_CUDA +template +::std::ostream& operator<<(::std::ostream& os, const cuda::buffer& buffer) +{ + const auto host_buf = cuda::make_buffer(buffer.stream(), cuda::mr::legacy_pinned_memory_resource{}, buffer); + + buffer.stream().sync(); + os << ::Catch::Detail::stringify(::std::vector{host_buf.begin(), host_buf.end()}); + return os; +} +_CCCL_END_NAMESPACE_CUDA + template <> struct Catch::StringMaker { diff --git a/c2h/include/c2h/vector.h b/c2h/include/c2h/vector.h index d26cfa6a206..0329a2ddaca 100644 --- a/c2h/include/c2h/vector.h +++ b/c2h/include/c2h/vector.h @@ -5,6 +5,8 @@ #include +#include +#include #include #include diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index 42079bcbf73..f5904cb8e2e 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -160,8 +160,8 @@ private: EnvT env) { using offset_t = detail::choose_offset_t; - using accum_t = ::cuda::std:: - __accumulator_t>, T>; + using accum_t = decltype(detail::reduce::template select_accum_t( + static_cast(nullptr))); if constexpr (Determinism == ::cuda::execution::determinism::__determinism_t::__gpu_to_gpu) { @@ -216,14 +216,16 @@ private: } //! @brief Internal implementation shared by Reduce and TransformReduce env overloads - template + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ReductionOpT, + typename TransformOpT, + typename T, + typename NumItemsT, + typename EnvT, + typename AccumT = decltype(detail::reduce::template select_accum_t( + static_cast(nullptr)))> [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t __transform_reduce( InputIteratorT d_in, OutputIteratorT d_out, @@ -543,8 +545,7 @@ public: InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, T init, EnvT env = {}) { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Reduce"); - using accum_t = ::cuda::std::__accumulator_t, T>; - return __transform_reduce(d_in, d_out, num_items, reduction_op, ::cuda::std::identity{}, init, env); + return __transform_reduce(d_in, d_out, num_items, reduction_op, ::cuda::std::identity{}, init, env); } //! @rst @@ -619,9 +620,8 @@ public: { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Sum"); using OutputT = cub::detail::non_void_value_t>; - using accum_t = ::cuda::std::__accumulator_t<::cuda::std::plus<>, cub::detail::it_value_t, OutputT>; - return __transform_reduce( - d_in, d_out, num_items, ::cuda::std::plus<>{}, ::cuda::std::identity{}, OutputT{}, env); + + return __transform_reduce(d_in, d_out, num_items, ::cuda::std::plus<>{}, ::cuda::std::identity{}, OutputT{}, env); } //! @rst @@ -2211,9 +2211,7 @@ public: EnvT env = {}) { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::TransformReduce"); - using accum_t = ::cuda::std:: - __accumulator_t>, T>; - return __transform_reduce(d_in, d_out, num_items, reduction_op, transform_op, init, env); + return __transform_reduce(d_in, d_out, num_items, reduction_op, transform_op, init, env); } //! @rst diff --git a/cub/test/catch2_test_device_reduce_env_api.cu b/cub/test/catch2_test_device_reduce_env_api.cu index 47eaef43c91..c32e78900a6 100644 --- a/cub/test/catch2_test_device_reduce_env_api.cu +++ b/cub/test/catch2_test_device_reduce_env_api.cu @@ -656,3 +656,15 @@ C2H_TEST("cub::DeviceReduce::Sum queries both stream and resource from composed REQUIRE(bytes_allocated > 0); REQUIRE(bytes_deallocated == bytes_allocated); } + +C2H_TEST("cub::DeviceReduce::Reduce allows no_init in env overloads", "[reduce][env]") +{ + auto input = thrust::device_vector{1, 2, 3, 4, 5}; + auto output = thrust::device_vector(1); + + auto error = cub::DeviceReduce::Reduce( + input.begin(), output.begin(), static_cast(input.size()), cuda::std::plus<>{}, cub::detail::reduce::no_init); + + REQUIRE(error == cudaSuccess); + REQUIRE(output[0] == 15); +} diff --git a/cudax/include/cuda/experimental/__multi_gpu/algorithm/common.h b/cudax/include/cuda/experimental/__multi_gpu/algorithm/common.h new file mode 100644 index 00000000000..b85367cbd04 --- /dev/null +++ b/cudax/include/cuda/experimental/__multi_gpu/algorithm/common.h @@ -0,0 +1,155 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___MULTI_GPU_ALGORITHM_COMMON_H +#define _CUDA_EXPERIMENTAL___MULTI_GPU_ALGORITHM_COMMON_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// NOLINTBEGIN(bugprone-reserved-identifier) + +namespace cuda::experimental::__detail +{ +#define __CUDAX_MULTI_GPU_DISPATCH(__logical_device, __count, __call, __arguments) \ + do \ + { \ + const auto __cur_context = ::cuda::__ensure_current_context{(__logical_device).context()}; \ + auto __status = ::cudaError_t{}; \ + \ + THRUST_INDEX_TYPE_DISPATCH(__status, __call, __count, __arguments); \ + THRUST_NS_QUALIFIER::cuda_cub::throw_on_error(__status, /*msg=*/"performing " #__call #__arguments); \ + } while (0) + +#define __CUDAX_MULTI_GPU_DOUBLE_DISPATCH(__logical_device, __count1, __count2, __call, __arguments) \ + do \ + { \ + const auto __cur_context = ::cuda::__ensure_current_context{(__logical_device).context()}; \ + auto __status = ::cudaError_t{}; \ + \ + THRUST_DOUBLE_INDEX_TYPE_DISPATCH(__status, __call, __count1, __count2, __arguments); \ + THRUST_NS_QUALIFIER::cuda_cub::throw_on_error(__status, /*msg=*/"performing " #__call #__arguments); \ + } while (0) + +template +[[nodiscard]] _CCCL_HOST_API constexpr ::cuda::stream_ref __stream_from_env(const _Env& __env) +{ + return ::cuda::__lazy_call_or( + ::cuda::get_stream, + [] { + return ::cuda::stream_ref{::CUstream{}}; + }, + __env); +} + +template +[[nodiscard]] _CCCL_HOST_API constexpr decltype(auto) __resource_from_env(const _Env& __env, ::cuda::device_ref __device) +{ + return ::cuda::__lazy_call_or( + ::cuda::mr::get_memory_resource, + [&] { + return ::cuda::device_default_memory_pool(__device); + }, + __env); +} + +template +[[nodiscard]] _CCCL_HOST_API constexpr decltype(auto) __sanitize_buffer_env(const _Env& __env) +{ + if constexpr (::cuda::__buffer_compatible_env<_Env>) + { + return __env; + } + else + { + return ::cuda::std::execution::env<>{}; + } +} + +template +[[nodiscard]] _CCCL_HOST_API constexpr auto __make_safe_uninitialized_buffer( + ::cuda::stream_ref __stream, _Resource&& __resource, ::cuda::std::size_t __size, const _Env& __env) +{ + if constexpr (::cuda::std::is_trivially_constructible_v<_Tp>) + { + return ::cuda::make_buffer<_Tp>( + __stream, ::cuda::std::forward<_Resource>(__resource), __size, ::cuda::no_init, __sanitize_buffer_env(__env)); + } + else + { + return ::cuda::make_buffer<_Tp>( + __stream, ::cuda::std::forward<_Resource>(__resource), __size, _Tp{}, __sanitize_buffer_env(__env)); + } +} + +template +struct __in_range_out_it_properties +{ + using __input_type = ::cuda::std::ranges::range_value_t<::cuda::std::ranges::range_reference_t<_InputRangeOfRanges>>; + using __output_type = ::cuda::std::iter_value_t<::cuda::std::ranges::range_reference_t<_RangeOfOutputIt>>; + + using __env_type = ::cuda::std::ranges::range_value_t<_EnvRange>; + + using __resource_type = ::cuda::std::remove_cvref_t(), ::cuda::std::declval<::cuda::device_ref>()))>; + + using __buffer_type = ::cuda::__buffer_type_for_props<__output_type, typename __resource_type::default_queries>; +}; + +template +_CCCL_CONCEPT __range_of_sized_random_access_ranges = _CCCL_REQUIRES_EXPR((_RangeOfRanges), )( + requires(::cuda::std::ranges::forward_range<_RangeOfRanges>), + requires(::cuda::std::ranges::sized_range<_RangeOfRanges>), + requires(::cuda::std::ranges::random_access_range<::cuda::std::ranges::range_reference_t<_RangeOfRanges>>)); + +template +_CCCL_CONCEPT __range_of_output_iters = _CCCL_REQUIRES_EXPR((_RangeOfIters, _Tp), )( + requires(::cuda::std::ranges::forward_range<_RangeOfIters>), + requires( + ::cuda::std::output_iterator<::cuda::std::remove_cvref_t<::cuda::std::ranges::range_reference_t<_RangeOfIters>>, + _Tp>)); +} // namespace cuda::experimental::__detail + +// NOLINTEND(bugprone-reserved-identifier) + +#include + +#endif // _CUDA_EXPERIMENTAL___MULTI_GPU_ALGORITHM_COMMON_H diff --git a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h new file mode 100644 index 00000000000..6650508a1bc --- /dev/null +++ b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h @@ -0,0 +1,374 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___MULTI_GPU_REDUCE_H +#define _CUDA_EXPERIMENTAL___MULTI_GPU_REDUCE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include + +// NOLINTBEGIN(bugprone-reserved-identifier) + +namespace cuda::experimental +{ +namespace __detail::__reduce +{ +template +struct __partial_redop +{ + using __buffer_type = _Buffer; + using __env_type = _Env; + + _Buffer __buffer; + _Env __env; + ::cuda::stream_ref __stream; +}; + +template +[[nodiscard]] _CCCL_HOST_API __partial_redop<_Buffer, _Env> __local_reduction( + const ::cuda::std::int32_t __ROOT_RANK, + _Comm&& __comm, + _Env __env, + _InputRange&& __inputs, + const _Tp& __init, + const _Tp& __ident, + _BinaryOp __op) +{ + const auto& __logical_device = __comm.logical_device(); + auto __stream = __stream_from_env(__env); + auto __resource = __resource_from_env(__env, __logical_device.underlying_device()); + const auto __num_items = ::cuda::std::ranges::size(__inputs); + // Allocate enough storage so that we can use the buffer directly in an in-place comm all + // gather/all reduce call. Those calls require that the receive buffer is of size nranks * + // sendcount. + auto __buff = __make_safe_uninitialized_buffer<_Tp>(__stream, __resource, __comm.size(), __env); + static_assert(::cuda::std::same_as); + + if (const auto __rank = __comm.rank(); __rank == __ROOT_RANK) + { + __CUDAX_MULTI_GPU_DISPATCH( + __logical_device, + __num_items, + CUB_NS_QUALIFIER::DeviceReduce::Reduce, + (::cuda::std::ranges::begin(__inputs), + // Similarly to above, prepare for the comm calls later. In order for those to be + // in-place, the sendbuff = recvbuff + rank, so we need to place our partial result + // there + __buff.begin() + __rank, + __num_items_fixed, + ::cuda::std::move(__op), + __init, + __env)); + } + else + { + __CUDAX_MULTI_GPU_DISPATCH( + __logical_device, + __num_items, + CUB_NS_QUALIFIER::DeviceReduce::Reduce, + (::cuda::std::ranges::begin(__inputs), + // Similarly to above, prepare for the com calls later. In order for those to be + // in-place, the sendbuff = recvbuff + rank, so we need to place our partial result + // there + __buff.begin() + __rank, + __num_items_fixed, + ::cuda::std::move(__op), + __ident, + __env)); + } + + return {::cuda::std::move(__buff), ::cuda::std::move(__env), __stream}; +} + +template +void __direct_reduction( + _CommRange&& __comms, _RangeOfOutputIt&& __outputs, const _BinaryOp& __op, ::std::vector<_PartialType>* __partials) +{ + auto&& __guard = ::cuda::std::ranges::begin(__comms)->group_guard(); + + for (auto&& [__comm, __local, __out_it] : ::cuda::std::ranges::views::zip(__comms, *__partials, __outputs)) + { + __comm.all_reduce( + __guard, + __local.__buffer.data() + __comm.rank(), + ::cuda::std::to_address(__out_it), + /*__count=*/1, + __op, + __local.__stream); + } +} + +template +void __two_stage_gather_reduction( + _CommRange&& __comms, _RangeOfOutputIt&& __outputs, const _BinaryOp& __op, ::std::vector<_PartialType>* __partials) +{ + { + auto&& __guard = ::cuda::std::ranges::begin(__comms)->group_guard(); + + for (auto&& [__comm, __local] : ::cuda::std::ranges::views::zip(__comms, *__partials)) + { + auto* const __ptr = __local.__buffer.data(); + + __comm.all_gather(__guard, __ptr + __comm.rank(), __ptr, /*__count=*/1, __local.__stream); + } + } + + for (auto&& [__comm, __part, __out] : ::cuda::std::ranges::views::zip(__comms, *__partials, __outputs)) + { + auto&& [__buffer, __env, _] = __part; + const auto __num_items = __buffer.size(); + + __CUDAX_MULTI_GPU_DISPATCH( + __comm.logical_device(), + __num_items, + CUB_NS_QUALIFIER::DeviceReduce::Reduce, + (__buffer.begin(), __out, __num_items_fixed, __op, CUB_NS_QUALIFIER::detail::reduce::no_init, __env)); + } +} +} // namespace __detail::__reduce + +//! @brief Reduce each input range over its communicator and write one result per output +//! iterator. +//! +//! Performs one reduction per communicator in parallel across devices. The communicators, +//! environments, input ranges and output iterators are iterated in lockstep, so for the +//! i-th element of each range the i-th input range is reduced with `__op` seeded by +//! `__init` on the i-th communicator's devices, the partial results are combined across +//! all ranks of that communicator, and the final value is written through the i-th output +//! iterator. +//! +//! This routine is used when the current thread or process owns multiple local GPUs. For +//! example, consider a scenario where there are 8 GPUs and 4 processes such that each +//! process owns 2 GPUs. Then the user would call this routine on each process, passing in both +//! local arrays: +//! @code{.cpp} +//! device_buffer gpu_0_data = ...; +//! device_buffer gpu_1_data = ...; +//! +//! cudax::reduce({comm0, comm1}, +//! {env_0, env_1}, +//! {gpu_0_data, gpu_1_data}, +//! {out_0, out_1}, +//! ...) +//! @endcode +//! +//! All ranges must have the same length. The algorithm will cap iteration to the shortest +//! length, but this should not be relied upon and may change at any time, for any reason. So +//! differing lengths is effectively undefined behavior. +//! +//! After this call returns, all local output iterators will hold the same value. In that sense +//! this routine is similar to an "all reduce". +//! +//! The identity element should survive reduction with any other value, returning the original +//! value unchanged. For example, for integers/foats and plus, the identity element is 0. For +//! maximum and minimum, the identity values are INT_MIN, and INT_MAX respectively. +//! +//! @tparam _CommRange The range of communicators. Each element must model the communicator +//! concept. +//! @tparam _EnvRange The range of execution environments. Each environment supplies the +//! stream and memory resource used for its communicator. +//! @tparam _InputRangeOfRanges The range whose elements are the per-communicator input +//! ranges. Each element must be a sized random-access range. +//! @tparam _RangeOfOutputIt The range of output iterators, one per communicator. +//! @tparam _Tp The reduction and result value type. Deduced by default from the output +//! element type. +//! @tparam _BinaryOp The binary reduction operator type. Defaults to `::cuda::std::plus<>`. +//! +//! @param[in] __comms The range of communicators. +//! @param[in] __envs The range of execution environments. +//! @param[in] __range_of_inputs The range of per-communicator input ranges to reduce. +//! @param[out] __outputs The range of output iterators receiving the per-communicator results. +//! @param[in] __init The initial value seeding each reduction. +//! @param[in] __op The binary reduction operator. +//! @param[in] __ident The identity element to be used in case of empty ranges. +_CCCL_TEMPLATE(class _CommRange, + class _EnvRange, + class _InputRangeOfRanges, + class _RangeOfOutputIt, + class _Tp = ::cuda::std::iter_value_t<::cuda::std::ranges::range_reference_t<_RangeOfOutputIt>>, + class _BinaryOp = ::cuda::std::plus<>) +_CCCL_REQUIRES(__range_of_communicators<_CommRange> _CCCL_AND ::cuda::std::ranges::forward_range<_EnvRange> _CCCL_AND + __detail::__range_of_sized_random_access_ranges<_InputRangeOfRanges> _CCCL_AND + __detail::__range_of_output_iters<_RangeOfOutputIt, _Tp>) +_CCCL_HOST_API void reduce( + _CommRange&& __comms, + _EnvRange&& __envs, + _InputRangeOfRanges&& __range_of_inputs, + _RangeOfOutputIt&& __outputs, + _Tp __init = {}, + _BinaryOp __op = {}, + _Tp __ident = ::cuda::identity_element<_BinaryOp, _Tp>()) +{ + static_assert(::cuda::std::ranges::sized_range<_CommRange>); + + using __properties = + ::cuda::experimental::__detail::__in_range_out_it_properties<_InputRangeOfRanges, _RangeOfOutputIt, _EnvRange>; + using __partial_type = ::cuda::experimental::__detail::__reduce::__partial_redop; + + const auto __num_local = ::cuda::std::ranges::size(__comms); + + if (!__num_local) + { + return; + } + + auto __partials = ::std::vector<__partial_type>{}; + + __partials.reserve(__num_local); + // TODO(jfaibussowit): can just be ranges::zip | ranges::transform | ranges::to() (and then + // we don't need to do the env, and buffer type deduction upfront) + for (auto&& [__comm, __env, __inputs] : ::cuda::std::ranges::views::zip(__comms, __envs, __range_of_inputs)) + { + __partials.emplace_back( + ::cuda::experimental::__detail::__reduce::__local_reduction( + /*__ROOT_RANK=*/0, __comm, __env, __inputs, __init, __ident, __op)); + } + + if constexpr (::cuda::experimental::__has_all_reduce<::cuda::std::ranges::range_value_t<_CommRange>, + typename __properties::__output_type*, + _BinaryOp>) + { + ::cuda::experimental::__detail::__reduce::__direct_reduction(__comms, __outputs, __op, &__partials); + } + else + { + ::cuda::experimental::__detail::__reduce::__two_stage_gather_reduction(__comms, __outputs, __op, &__partials); + } +} + +//! @brief Reduce a single input range over a single communicator using the given execution +//! environment. +//! +//! Convenience wrapper that forwards a single `(communicator, environment, input range, +//! output iterator)` to the range-based overload. The input range is reduced with `__op` +//! seeded by `__init` across the communicator's ranks and the final value is written +//! through `__output`. +//! +//! @tparam _Comm The communicator type. Must model the communicator concept. +//! @tparam _Env The execution environment type. Supplies the stream and memory resource. +//! @tparam _InputRange The input range type. Must be a random-access range. +//! @tparam _OutputIt The output iterator type. +//! @tparam _Tp The reduction and result value type. Deduced by default from the output +//! value type. +//! @tparam _BinaryOp The binary reduction operator type. Defaults to `::cuda::std::plus<>`. +//! +//! @param[in] __comm The communicator. +//! @param[in] __env The execution environment. +//! @param[in] __inputs The input range to reduce. +//! @param[out] __output The output iterator receiving the result. +//! @param[in] __init The initial value seeding the reduction. +//! @param[in] __op The binary reduction operator. +//! @param[in] __ident The identity element to be used in case of empty ranges. +_CCCL_TEMPLATE(class _Comm, + class _Env, + class _InputRange, + class _OutputIt, + class _Tp = ::cuda::std::iter_value_t<_OutputIt>, + class _BinaryOp = ::cuda::std::plus<>) +_CCCL_REQUIRES(__communicator<_Comm> _CCCL_AND ::cuda::std::ranges::random_access_range<_InputRange> + _CCCL_AND ::cuda::std::output_iterator<_OutputIt, _Tp>) +_CCCL_HOST_API void reduce( + _Comm&& __comm, + _Env&& __env, + _InputRange&& __inputs, + _OutputIt __output, + _Tp __init = {}, + _BinaryOp __op = {}, + _Tp __ident = ::cuda::identity_element<_BinaryOp, _Tp>()) +{ + reduce(::cuda::std::span<::cuda::std::remove_reference_t<_Comm>, 1>{::cuda::std::addressof(__comm), 1}, + ::cuda::std::span<::cuda::std::remove_reference_t<_Env>, 1>{::cuda::std::addressof(__env), 1}, + ::cuda::std::span<::cuda::std::remove_reference_t<_InputRange>, 1>{::cuda::std::addressof(__inputs), 1}, + ::cuda::std::span<::cuda::std::remove_reference_t<_OutputIt>, 1>{::cuda::std::addressof(__output), 1}, + ::cuda::std::move(__init), + ::cuda::std::move(__op), + ::cuda::std::move(__ident)); +} + +//! @brief Reduce a single input range over a single communicator using +//! `::cuda::std::plus<>` and a default-constructed initial value. +//! +//! Convenience wrapper that supplies a default execution environment and a default +//! `::cuda::std::plus<>` reduction, forwarding to the environment-taking overload. The +//! result value type is the input range's value type. +//! +//! @tparam _Comm The communicator type. Must model the communicator concept. +//! @tparam _InputRange The input range type. Must be a random-access range. +//! @tparam _OutputIt The output iterator type. +//! +//! @param[in] __comm The communicator. +//! @param[in] __inputs The input range to reduce. +//! @param[out] __output The output iterator receiving the result. +//! @param[in] __init The initial value seeding the reduction. +//! @param[in] __op The binary reduction operator. +//! @param[in] __ident The identity element to be used in case of empty ranges. +_CCCL_TEMPLATE(class _Comm, + class _InputRange, + class _OutputIt, + class _Tp = ::cuda::std::iter_value_t<_OutputIt>, + class _BinaryOp = ::cuda::std::plus<>) +_CCCL_REQUIRES(__communicator<_Comm> _CCCL_AND ::cuda::std::ranges::random_access_range<_InputRange> + _CCCL_AND ::cuda::std::output_iterator<_OutputIt, _Tp>) +_CCCL_HOST_API void reduce( + _Comm&& __comm, + _InputRange&& __inputs, + _OutputIt __output, + _Tp __init = {}, + _BinaryOp __op = {}, + _Tp __ident = ::cuda::identity_element<_BinaryOp, _Tp>()) +{ + reduce(::cuda::std::forward<_Comm>(__comm), + ::cuda::std::execution::env<>{}, + ::cuda::std::forward<_InputRange>(__inputs), + ::cuda::std::forward<_OutputIt>(__output), + ::cuda::std::move(__init), + ::cuda::std::move(__op), + ::cuda::std::move(__ident)); +} +} // namespace cuda::experimental + +// NOLINTEND(bugprone-reserved-identifier) + +#include + +#endif // _CUDA_EXPERIMENTAL___MULTI_GPU_REDUCE_H diff --git a/cudax/include/cuda/experimental/__multi_gpu/concepts.h b/cudax/include/cuda/experimental/__multi_gpu/concepts.h index 2d805553aee..6c94acf7ec2 100644 --- a/cudax/include/cuda/experimental/__multi_gpu/concepts.h +++ b/cudax/include/cuda/experimental/__multi_gpu/concepts.h @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include #include @@ -78,14 +80,14 @@ _CCCL_CONCEPT __communicator = _CCCL_REQUIRES_EXPR((_Comm), _Comm& __comm)( // Use a typed pointer as default here, since the op may need to instantiated with a // dereferenceable pointer type for reductions -template +template > _CCCL_CONCEPT __has_reduce = _CCCL_REQUIRES_EXPR( - (_Comm, _Ptr), + (_Comm, _Ptr, _Op), _Comm& __comm, _Ptr __sendbuff, _Ptr __recvbuff, ::cuda::std::size_t __count, - ::cuda::std::plus<> __op, + _Op __op, ::cuda::std::int32_t __root, ::cuda::stream_ref __stream)( requires(__communicator<_Comm>), @@ -96,14 +98,14 @@ _CCCL_CONCEPT __has_reduce = _CCCL_REQUIRES_EXPR( // Use a typed pointer as default here, since the op may need to instantiated with a // dereferenceable pointer type for reductions -template +template > _CCCL_CONCEPT __has_all_reduce = _CCCL_REQUIRES_EXPR( - (_Comm, _Ptr), + (_Comm, _Ptr, _Op), _Comm& __comm, _Ptr __sendbuff, _Ptr __recvbuff, ::cuda::std::size_t __count, - ::cuda::std::plus<> __op, + _Op __op, ::cuda::stream_ref __stream)( requires(__communicator<_Comm>), _Same_as(void) __comm.all_reduce( @@ -214,6 +216,18 @@ _CCCL_CONCEPT __has_all_to_all_v = _CCCL_REQUIRES_EXPR( __recv_counts, __recv_displs, __stream)); + +// ========================================================================================== + +template +_CCCL_CONCEPT __range_of_communicators = _CCCL_REQUIRES_EXPR((_Range), )( + requires(::cuda::std::ranges::forward_range<_Range>), + requires(__communicator<::cuda::std::remove_cvref_t<::cuda::std::ranges::range_reference_t<_Range>>>)); + +template +_CCCL_CONCEPT __range_of_streams = _CCCL_REQUIRES_EXPR((_Range), )( + requires(::cuda::std::ranges::forward_range<_Range>), + requires(::cuda::std::ranges::__container_compatible_range<_Range, ::cuda::stream_ref>)); } // namespace cuda::experimental // NOLINTEND(bugprone-reserved-identifier) diff --git a/cudax/test/multi_gpu/CMakeLists.txt b/cudax/test/multi_gpu/CMakeLists.txt index 0bef8b2bc2e..f617ecf5e3b 100644 --- a/cudax/test/multi_gpu/CMakeLists.txt +++ b/cudax/test/multi_gpu/CMakeLists.txt @@ -63,6 +63,7 @@ function(cudax_add_multi_gpu_test sub_prefix target_name_var source) set(${target_name_var} ${test_target} PARENT_SCOPE) endfunction() +add_subdirectory(algorithms) add_subdirectory(communicators) add_subdirectory(concepts) add_subdirectory(nccl) diff --git a/cudax/test/multi_gpu/algorithms/CMakeLists.txt b/cudax/test/multi_gpu/algorithms/CMakeLists.txt new file mode 100644 index 00000000000..279f970b1af --- /dev/null +++ b/cudax/test/multi_gpu/algorithms/CMakeLists.txt @@ -0,0 +1,11 @@ +#===----------------------------------------------------------------------===## +# +# Part of CUDA Experimental in CUDA C++ Core Libraries, +# under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +# +#===----------------------------------------------------------------------===## + +add_subdirectory(reduce) diff --git a/cudax/test/multi_gpu/algorithms/reduce/CMakeLists.txt b/cudax/test/multi_gpu/algorithms/reduce/CMakeLists.txt new file mode 100644 index 00000000000..1a43a0be4f8 --- /dev/null +++ b/cudax/test/multi_gpu/algorithms/reduce/CMakeLists.txt @@ -0,0 +1,20 @@ +#===----------------------------------------------------------------------===## +# +# Part of CUDA Experimental in CUDA C++ Core Libraries, +# under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +# +#===----------------------------------------------------------------------===## + +if (NOT cudax_ENABLE_NCCL) + return() +endif() + +file(GLOB test_srcs LIST_DIRECTORIES FALSE CONFIGURE_DEPENDS *.cu *.cpp) + +foreach (src IN LISTS test_srcs) + cudax_add_multi_gpu_test("algorithms.reduce" test_target "${src}") + target_link_libraries(${test_target} PRIVATE NCCL::nccl) +endforeach() diff --git a/cudax/test/multi_gpu/algorithms/reduce/basic.cu b/cudax/test/multi_gpu/algorithms/reduce/basic.cu new file mode 100644 index 00000000000..0c5f998bc14 --- /dev/null +++ b/cudax/test/multi_gpu/algorithms/reduce/basic.cu @@ -0,0 +1,356 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +#include "../../communicators/nccl/nccl_test_helpers.cuh" +#include + +namespace +{ +using custom_value = c2h::custom_type_t; +static_assert(cudax::nccl_transportable); + +template +T make_value(int i) +{ + return static_cast(i); +} + +template <> +custom_value make_value<>(int i) +{ + return {{static_cast(i), static_cast(i)}}; +}; + +// A custom reduction operator equivalent to `cuda::std::plus<>`. +struct custom_plus +{ + template + _CCCL_HOST_DEVICE constexpr T operator()(const T& lhs, const T& rhs) const + { + return lhs + rhs; + } +}; + +template +struct identity_value; // Unsupported combinations fail to compile. + +template +struct identity_value> +{ + [[nodiscard]] static T get() + { + return make_value(0); + } +}; + +template +struct identity_value +{ + [[nodiscard]] static T get() + { + return make_value(0); + } +}; + +template +struct identity_value> +{ + [[nodiscard]] static T get() + { + return cuda::std::numeric_limits::lowest(); + } +}; + +template +[[nodiscard]] T get_identity() +{ + return identity_value::get(); +} + +using value_types = c2h::type_list; +using operators = c2h::type_list<::cuda::std::plus<>, ::cuda::maximum<>, custom_plus>; + +// One output iterator per local output buffer. Collected after `out` is fully built so the +// iterators do not dangle across reallocations. +template +[[nodiscard]] auto make_output_iterators(OutBuffers& out) +{ + std::vector::iterator> outputs; + + outputs.reserve(out.size()); + for (auto& buf : out) + { + outputs.push_back(buf.begin()); + } + return outputs; +} + +// Run the full reduction, wait for it to finish, and check that `reduce` left its argument ranges +// untouched. This boilerplate is identical for every test regardless of how the inputs are shaped. +template +void do_reduce( + Comms& comms, Envs& envs, In& in, Outputs& outputs, const T& init, const T& ident, Op op, Streams& streams) +{ + using value_type = typename cuda::std::remove_cvref_t::value_type; + + // cuda::std::execution::env has no operator==, so we can only compare the sizes. + const auto envs_size = envs.size(); + + auto pool = cuda::mr::legacy_pinned_memory_resource{}; + std::vector> in_copy; + in_copy.reserve(in.size()); + for (const auto& buf : in) + { + in_copy.emplace_back(cuda::make_buffer(buf.stream(), pool, buf)); + } + + const auto outputs_copy = outputs; + + cudax::reduce(comms, envs, in, outputs, init, op, ident); + + for (auto& stream : streams) + { + stream.sync(); + } + + REQUIRE(envs.size() == envs_size); + for (cuda::std::size_t i = 0; i < in.size(); ++i) + { + const auto actual = cuda::make_buffer(in[i].stream(), pool, in[i]); + REQUIRE_THAT(actual, Equals(in_copy[i])); + } + REQUIRE_THAT(outputs, Catch::Matchers::Equals(outputs_copy)); +} +} // namespace + +MGMN_TEST("reduce, one element per rank", value_types, operators) +{ + using T = c2h::get<0, TestType>; + using Op = c2h::get<1, TestType>; + + // Seed each reduction with a few hardcoded initializers. The init participates in the fold the + // same way on host and device, so any value works for every operator under test. + const T init = make_value(GENERATE(0, 1, -1, 5)); + const auto ident = get_identity(); + + auto comms = this->communicators(); + auto streams = nccl_test_util::make_streams(); + + // Global rank `comms[i].rank()` contributes the single value `rank`. Each local rank also gets a + // one-element output buffer and an environment carrying its stream, so the reduction is + // stream-ordered on the correct device. `reference` mirrors the contributions of every global + // rank so we can fold them on the host exactly like `reduce` does on the device. + std::vector> in; + std::vector> out; + std::vector envs; + std::vector reference; + + in.reserve(comms.size()); + out.reserve(comms.size()); + envs.reserve(comms.size()); + for (cuda::std::size_t i = 0; i < comms.size(); ++i) + { + const auto values = {make_value(comms[i].rank())}; + in.emplace_back(cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), values)); + out.emplace_back( + cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init)); + envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}}); + } + for (int r = 0; r < comms.front().size(); ++r) + { + reference.push_back(make_value(r)); + } + + auto outputs = make_output_iterators(out); + + do_reduce(comms, envs, in, outputs, init, ident, Op{}, streams); + + const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{}); + + for (const auto& buf : out) + { + const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected); + + REQUIRE_THAT(buf, Equals(exp)); + } +} + +MGMN_TEST("reduce, multiple elements per rank", value_types, operators) +{ + using T = c2h::get<0, TestType>; + using Op = c2h::get<1, TestType>; + + // Seed each reduction with a few hardcoded initializers. The init participates in the fold the + // same way on host and device, so any value works for every operator under test. + const T init = make_value(GENERATE(0, 1, -1, 5)); + const auto ident = get_identity(); + + auto comms = this->communicators(); + auto streams = nccl_test_util::make_streams(); + + // Global rank `comms[i].rank()` contributes `{rank, rank, rank}`. `reduce` first does a local CUB + // reduction of each rank's range, then combines the partials across ranks. Each local rank also + // gets a one-element output buffer and an environment carrying its stream. `reference` mirrors + // every global rank's three contributions for the host-side fold. + std::vector> in; + std::vector> out; + std::vector envs; + std::vector reference; + + in.reserve(comms.size()); + out.reserve(comms.size()); + envs.reserve(comms.size()); + for (cuda::std::size_t i = 0; i < comms.size(); ++i) + { + const auto v = make_value(comms[i].rank()); + const auto values = {v, v, v}; + in.emplace_back(cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), values)); + out.emplace_back( + cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init)); + envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}}); + } + for (int r = 0; r < comms.front().size(); ++r) + { + const auto v = make_value(r); + reference.insert(reference.end(), {v, v, v}); + } + + auto outputs = make_output_iterators(out); + + do_reduce(comms, envs, in, outputs, init, ident, Op{}, streams); + + const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{}); + + for (const auto& buf : out) + { + const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected); + + REQUIRE_THAT(buf, Equals(exp)); + } +} + +MGMN_TEST("reduce, some ranks empty", value_types, operators) +{ + using T = c2h::get<0, TestType>; + using Op = c2h::get<1, TestType>; + + const T init = make_value(GENERATE(0, 1, -1, 5)); + const auto ident = get_identity(); + + auto comms = this->communicators(); + auto streams = nccl_test_util::make_streams(); + + // Even global ranks contribute two copies of `rank`; odd global ranks contribute an empty input + // range. Rank 0 (the reduction root) is always non-empty. `reduce` must treat an empty rank as + // contributing nothing, exactly like `std::accumulate` over the surviving elements. `reference` + // mirrors that for the host-side fold. + std::vector> in; + std::vector> out; + std::vector envs; + std::vector reference; + + in.reserve(comms.size()); + out.reserve(comms.size()); + envs.reserve(comms.size()); + for (cuda::std::size_t i = 0; i < comms.size(); ++i) + { + const auto rank = comms[i].rank(); + if (rank % 2 == 0) + { + const auto values = {make_value(rank), make_value(rank)}; + in.emplace_back(cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), values)); + } + else + { + in.emplace_back(cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device())); + } + out.emplace_back( + cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init)); + envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}}); + } + + for (int r = 0; r < comms.front().size(); r += 2) + { + reference.push_back(make_value(r)); + reference.push_back(make_value(r)); + } + + auto outputs = make_output_iterators(out); + + do_reduce(comms, envs, in, outputs, init, ident, Op{}, streams); + + const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{}); + + for (const auto& buf : out) + { + const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected); + + REQUIRE_THAT(buf, Equals(exp)); + } +} + +MGMN_TEST("reduce, all ranks empty", value_types, operators) +{ + using T = c2h::get<0, TestType>; + using Op = c2h::get<1, TestType>; + + const T init = make_value(GENERATE(0, 1, -1, 5)); + const auto ident = get_identity(); + + auto comms = this->communicators(); + auto streams = nccl_test_util::make_streams(); + + // No rank contributes any element. Reducing nothing seeded by `init` is just `init`, so every + // output must equal `init` regardless of the operator. + std::vector> in; + std::vector> out; + std::vector envs; + + in.reserve(comms.size()); + out.reserve(comms.size()); + envs.reserve(comms.size()); + for (cuda::std::size_t i = 0; i < comms.size(); ++i) + { + in.emplace_back(cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device())); + out.emplace_back( + cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init)); + envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}}); + } + + auto outputs = make_output_iterators(out); + + do_reduce(comms, envs, in, outputs, init, ident, Op{}, streams); + + // Reducing nothing seeded by `init` yields `init`, exactly like `std::accumulate` over an empty + // range. + const T expected = init; + + for (const auto& buf : out) + { + const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected); + + REQUIRE_THAT(buf, Equals(exp)); + } +} diff --git a/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu b/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu new file mode 100644 index 00000000000..1c8dbca81dd --- /dev/null +++ b/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu @@ -0,0 +1,136 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +#include "../../communicators/nccl/nccl_test_helpers.cuh" + +namespace +{ +using value_type = cuda::std::int32_t; + +template +[[nodiscard]] auto make_output_iterators(OutBuffers& out) +{ + std::vector::iterator> outputs; + + outputs.reserve(out.size()); + for (auto& buf : out) + { + outputs.push_back(buf.begin()); + } + return outputs; +} +} // namespace + +MGMN_TEST("reduce, range overload with default initializer and default plus", ) +{ + auto comms = this->communicators(); + auto streams = nccl_test_util::make_streams(); + + std::vector> in; + std::vector> out; + std::vector envs; + std::vector reference; + + in.reserve(comms.size()); + out.reserve(comms.size()); + envs.reserve(comms.size()); + for (cuda::std::size_t i = 0; i < comms.size(); ++i) + { + const auto values = {static_cast(comms[i].rank())}; + in.emplace_back( + cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), values)); + out.emplace_back(cuda::make_device_buffer( + streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init)); + envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}}); + } + for (int r = 0; r < comms.front().size(); ++r) + { + reference.push_back(static_cast(r)); + } + + auto outputs = make_output_iterators(out); + + cudax::reduce(comms, envs, in, outputs); + + for (auto& stream : streams) + { + stream.sync(); + } + + const value_type expected = std::accumulate(reference.begin(), reference.end(), value_type{}); + + for (const auto& buf : out) + { + const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected); + + REQUIRE_THAT(buf, Equals(exp)); + } +} + +MGMN_TEST("reduce, range overload with explicit initializer and default plus", ) +{ + const value_type init{10}; + + auto comms = this->communicators(); + auto streams = nccl_test_util::make_streams(); + + std::vector> in; + std::vector> out; + std::vector envs; + std::vector reference; + + in.reserve(comms.size()); + out.reserve(comms.size()); + envs.reserve(comms.size()); + for (cuda::std::size_t i = 0; i < comms.size(); ++i) + { + const auto values = {static_cast(comms[i].rank())}; + in.emplace_back( + cuda::make_device_buffer(streams[i], comms[i].logical_device().underlying_device(), values)); + out.emplace_back(cuda::make_device_buffer( + streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init)); + envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}}); + } + for (int r = 0; r < comms.front().size(); ++r) + { + reference.push_back(static_cast(r)); + } + + auto outputs = make_output_iterators(out); + + cudax::reduce(comms, envs, in, outputs, init); + + for (auto& stream : streams) + { + stream.sync(); + } + + const value_type expected = std::accumulate(reference.begin(), reference.end(), init); + + for (const auto& buf : out) + { + const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected); + + REQUIRE_THAT(buf, Equals(exp)); + } +} diff --git a/cudax/test/multi_gpu/communicators/nccl/basic.cu b/cudax/test/multi_gpu/communicators/nccl/basic.cu index 1ba084b8adf..f21f209e18b 100644 --- a/cudax/test/multi_gpu/communicators/nccl/basic.cu +++ b/cudax/test/multi_gpu/communicators/nccl/basic.cu @@ -32,7 +32,7 @@ C2H_TEST("nccl_communicator_ref not constructible from NCCL_COMM_NULL", "[multi_ STATIC_REQUIRE(!::cuda::std::is_constructible_v); } -NCCL_COMM_TEST("nccl_communicator_ref basic") +MGMN_TEST("nccl_communicator_ref basic", ) { SECTION("rank and size") { diff --git a/cudax/test/multi_gpu/communicators/nccl/collective.cu b/cudax/test/multi_gpu/communicators/nccl/collective.cu index 92a1402f721..96f73edfa21 100644 --- a/cudax/test/multi_gpu/communicators/nccl/collective.cu +++ b/cudax/test/multi_gpu/communicators/nccl/collective.cu @@ -26,7 +26,7 @@ namespace constexpr cuda::std::int32_t ROOT_RANK = 0; } // namespace -NCCL_COMM_TEST("nccl_communicator_ref all_reduce sum") +MGMN_TEST("nccl_communicator_ref all_reduce sum", ) { auto streams = nccl_test_util::make_streams(); @@ -69,7 +69,7 @@ NCCL_COMM_TEST("nccl_communicator_ref all_reduce sum") } } -NCCL_COMM_TEST("nccl_communicator_ref all_reduce maximum") +MGMN_TEST("nccl_communicator_ref all_reduce maximum", ) { auto streams = nccl_test_util::make_streams(); @@ -112,7 +112,7 @@ NCCL_COMM_TEST("nccl_communicator_ref all_reduce maximum") } } -NCCL_COMM_TEST("nccl_communicator_ref reduce sum to root 0") +MGMN_TEST("nccl_communicator_ref reduce sum to root 0", ) { auto streams = nccl_test_util::make_streams(); @@ -150,7 +150,7 @@ NCCL_COMM_TEST("nccl_communicator_ref reduce sum to root 0") REQUIRE_THAT(actual, Equals(expected)); } -NCCL_COMM_TEST("nccl_communicator_ref broadcast from root 0") +MGMN_TEST("nccl_communicator_ref broadcast from root 0", ) { auto streams = nccl_test_util::make_streams(); @@ -192,7 +192,7 @@ NCCL_COMM_TEST("nccl_communicator_ref broadcast from root 0") } } -NCCL_COMM_TEST("nccl_communicator_ref all_gather") +MGMN_TEST("nccl_communicator_ref all_gather", ) { auto streams = nccl_test_util::make_streams(); @@ -241,7 +241,7 @@ NCCL_COMM_TEST("nccl_communicator_ref all_gather") } } -NCCL_COMM_TEST("nccl_communicator_ref gather_v to root 0") +MGMN_TEST("nccl_communicator_ref gather_v to root 0", ) { auto streams = nccl_test_util::make_streams(); @@ -308,7 +308,7 @@ NCCL_COMM_TEST("nccl_communicator_ref gather_v to root 0") REQUIRE_THAT(actual, Equals(expected)); } -NCCL_COMM_TEST("nccl_communicator_ref all_to_all_v") +MGMN_TEST("nccl_communicator_ref all_to_all_v", ) { auto streams = nccl_test_util::make_streams(); @@ -379,7 +379,7 @@ NCCL_COMM_TEST("nccl_communicator_ref all_to_all_v") #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 0) -NCCL_COMM_TEST("nccl_communicator_ref gather to root 0") +MGMN_TEST("nccl_communicator_ref gather to root", ) { auto streams = nccl_test_util::make_streams(); @@ -424,7 +424,7 @@ NCCL_COMM_TEST("nccl_communicator_ref gather to root 0") REQUIRE_THAT(actual, Equals(expected)); } -NCCL_COMM_TEST("nccl_communicator_ref all_to_all") +MGMN_TEST("nccl_communicator_ref all_to_all", ) { auto streams = nccl_test_util::make_streams(); diff --git a/cudax/test/multi_gpu/communicators/nccl/nccl_test_helpers.cuh b/cudax/test/multi_gpu/communicators/nccl/nccl_test_helpers.cuh index fc5d1298b30..456fcf6120a 100644 --- a/cudax/test/multi_gpu/communicators/nccl/nccl_test_helpers.cuh +++ b/cudax/test/multi_gpu/communicators/nccl/nccl_test_helpers.cuh @@ -99,7 +99,7 @@ private: std::vector wrappers_{}; }; -#define NCCL_COMM_TEST(NAME, ...) \ +#define MGMN_TEST(NAME, ...) \ C2H_TEST_WITH_FIXTURE(::nccl_test_util::nccl_comm_fixture, NAME, "[multi_gpu][nccl]", __VA_ARGS__) } // namespace nccl_test_util diff --git a/cudax/test/multi_gpu/communicators/nccl/peer_to_peer.cu b/cudax/test/multi_gpu/communicators/nccl/peer_to_peer.cu index 0f6f2202ed3..c96989c60f4 100644 --- a/cudax/test/multi_gpu/communicators/nccl/peer_to_peer.cu +++ b/cudax/test/multi_gpu/communicators/nccl/peer_to_peer.cu @@ -28,7 +28,7 @@ struct payload } // namespace // Ring exchange via send/recv. Rank r contributes {r, r, r}. -NCCL_COMM_TEST("nccl_communicator_ref send/recv ring") +MGMN_TEST("nccl_communicator_ref send/recv ring", ) { if (cuda::devices.size() == 1) { @@ -84,7 +84,7 @@ NCCL_COMM_TEST("nccl_communicator_ref send/recv ring") } } -NCCL_COMM_TEST("nccl_communicator_ref send/recv transports trivially copyable payload") +MGMN_TEST("nccl_communicator_ref send/recv transports trivially copyable payload", ) { if (cuda::devices.size() == 1) { diff --git a/libcudacxx/include/cuda/__container/buffer.h b/libcudacxx/include/cuda/__container/buffer.h index e77f094a7de..059d8f1b44a 100644 --- a/libcudacxx/include/cuda/__container/buffer.h +++ b/libcudacxx/include/cuda/__container/buffer.h @@ -51,6 +51,7 @@ # include # include # include +# include # include # include # include @@ -62,9 +63,19 @@ //! @brief The \c buffer class provides a container of contiguous memory _CCCL_BEGIN_NAMESPACE_CUDA +template +inline constexpr bool __is_any_env = false; + +template +inline constexpr bool __is_any_env<::cuda::std::execution::env<_Tp...>> = true; + +// TODO: +// +// Find a way to enable checks again without bricking for environments that clearly (which we +// can somehow detect) dont intend to have alignment. template inline constexpr bool __buffer_compatible_env = - ::cuda::std::is_same_v<::cuda::std::decay_t<_Env>, ::cuda::std::execution::env<>> + __is_any_env<::cuda::std::remove_cvref_t<_Env>> || ::cuda::std::execution::__queryable_with; _CCCL_BEGIN_NAMESPACE_ABI_VER4_BUMP