From 203bd06bef2860586bce6bb6ced4fc4218827609 Mon Sep 17 00:00:00 2001
From: Jacob Faibussowitsch <jacob.fai@gmail.com>
Date: Wed, 1 Jul 2026 15:00:19 -0400
Subject: [PATCH 1/7] Fix CUB DeviceReduce env overloads to accept no_init_t

---
 cub/cub/device/device_reduce.cuh | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)
diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh
index 42079bcbf73..306f6d83bb2 100644
--- a/cub/cub/device/device_reduce.cuh
+++ b/cub/cub/device/device_reduce.cuh
@@ -216,14 +216,16 @@ private:
   }
 
   //! @brief Internal implementation shared by Reduce and TransformReduce env overloads
-  template <typename AccumT,
-            typename InputIteratorT,
-            typename OutputIteratorT,
-            typename ReductionOpT,
-            typename TransformOpT,
-            typename T,
-            typename NumItemsT,
-            typename EnvT>
+  template <
+    typename InputIteratorT,
+    typename OutputIteratorT,
+    typename ReductionOpT,
+    typename TransformOpT,
+    typename T,
+    typename NumItemsT,
+    typename EnvT,
+    typename AccumT = decltype(detail::reduce::template select_accum_t<InputIteratorT, T, ReductionOpT, TransformOpT>(
+      static_cast<detail::use_default*>(nullptr)))>
   [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t __transform_reduce(
     InputIteratorT d_in,
     OutputIteratorT d_out,
@@ -543,8 +545,7 @@ public:
     InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, T init, EnvT env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Reduce");
-    using accum_t = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
-    return __transform_reduce<accum_t>(d_in, d_out, num_items, reduction_op, ::cuda::std::identity{}, init, env);
+    return __transform_reduce(d_in, d_out, num_items, reduction_op, ::cuda::std::identity{}, init, env);
   }
 
   //! @rst
@@ -619,9 +620,8 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Sum");
     using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
-    using accum_t = ::cuda::std::__accumulator_t<::cuda::std::plus<>, cub::detail::it_value_t<InputIteratorT>, OutputT>;
-    return __transform_reduce<accum_t>(
-      d_in, d_out, num_items, ::cuda::std::plus<>{}, ::cuda::std::identity{}, OutputT{}, env);
+
+    return __transform_reduce(d_in, d_out, num_items, ::cuda::std::plus<>{}, ::cuda::std::identity{}, OutputT{}, env);
   }
 
   //! @rst
@@ -2211,9 +2211,7 @@ public:
     EnvT env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::TransformReduce");
-    using accum_t = ::cuda::std::
-      __accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>;
-    return __transform_reduce<accum_t>(d_in, d_out, num_items, reduction_op, transform_op, init, env);
+    return __transform_reduce(d_in, d_out, num_items, reduction_op, transform_op, init, env);
   }
 
   //! @rst

From 50265981e1848596f73b1c2893a60841106dac8e Mon Sep 17 00:00:00 2001
From: Jacob Faibussowitsch <jacob.fai@gmail.com>
Date: Thu, 2 Jul 2026 09:48:43 -0400
Subject: [PATCH 2/7] fixup! Fix CUB DeviceReduce env overloads to accept
 no_init_t

---
 cub/test/catch2_test_device_reduce_env_api.cu | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/cub/test/catch2_test_device_reduce_env_api.cu b/cub/test/catch2_test_device_reduce_env_api.cu
index 47eaef43c91..c32e78900a6 100644
--- a/cub/test/catch2_test_device_reduce_env_api.cu
+++ b/cub/test/catch2_test_device_reduce_env_api.cu
@@ -656,3 +656,15 @@ C2H_TEST("cub::DeviceReduce::Sum queries both stream and resource from composed
   REQUIRE(bytes_allocated > 0);
   REQUIRE(bytes_deallocated == bytes_allocated);
 }
+
+C2H_TEST("cub::DeviceReduce::Reduce allows no_init in env overloads", "[reduce][env]")
+{
+  auto input  = thrust::device_vector<int>{1, 2, 3, 4, 5};
+  auto output = thrust::device_vector<int>(1);
+
+  auto error = cub::DeviceReduce::Reduce(
+    input.begin(), output.begin(), static_cast<int>(input.size()), cuda::std::plus<>{}, cub::detail::reduce::no_init);
+
+  REQUIRE(error == cudaSuccess);
+  REQUIRE(output[0] == 15);
+}

From 0c4f9a90322eae0a9ce433ba44ae8b77f93387fb Mon Sep 17 00:00:00 2001
From: Jacob Faibussowitsch <jacob.fai@gmail.com>
Date: Thu, 2 Jul 2026 10:16:34 -0400
Subject: [PATCH 3/7] fixup! Fix CUB DeviceReduce env overloads to accept
 no_init_t

---
 cub/cub/device/device_reduce.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh
index 306f6d83bb2..f5904cb8e2e 100644
--- a/cub/cub/device/device_reduce.cuh
+++ b/cub/cub/device/device_reduce.cuh
@@ -160,8 +160,8 @@ private:
     EnvT env)
   {
     using offset_t = detail::choose_offset_t<NumItemsT>;
-    using accum_t  = ::cuda::std::
-      __accumulator_t<ReductionOpT, ::cuda::std::invoke_result_t<TransformOpT, detail::it_value_t<InputIteratorT>>, T>;
+    using accum_t  = decltype(detail::reduce::template select_accum_t<InputIteratorT, T, ReductionOpT, TransformOpT>(
+      static_cast<detail::use_default*>(nullptr)));
 
     if constexpr (Determinism == ::cuda::execution::determinism::__determinism_t::__gpu_to_gpu)
     {

From 3712e5e147d2af15a8b347c279cbf40d64aeb807 Mon Sep 17 00:00:00 2001
From: Jacob Faibussowitsch <jacob.fai@gmail.com>
Date: Fri, 12 Jun 2026 11:51:20 -0400
Subject: [PATCH 4/7] Add multi-GPU reduce

---
 .../__multi_gpu/algorithm/common.h            | 155 ++++++++
 .../__multi_gpu/algorithm/reduce/reduce.h     | 349 ++++++++++++++++++
 .../cuda/experimental/__multi_gpu/concepts.h  |  14 +
 cudax/test/multi_gpu/CMakeLists.txt           |   1 +
 .../test/multi_gpu/algorithms/CMakeLists.txt  |  11 +
 .../algorithms/reduce/CMakeLists.txt          |  20 +
 .../test/multi_gpu/algorithms/reduce/basic.cu | 322 ++++++++++++++++
 .../multi_gpu/communicators/nccl/basic.cu     |   2 +-
 .../communicators/nccl/collective.cu          |  18 +-
 .../communicators/nccl/nccl_test_helpers.cuh  |   2 +-
 .../communicators/nccl/peer_to_peer.cu        |   4 +-
 libcudacxx/include/cuda/__container/buffer.h  |  13 +-
 12 files changed, 897 insertions(+), 14 deletions(-)
 create mode 100644 cudax/include/cuda/experimental/__multi_gpu/algorithm/common.h
 create mode 100644 cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h
 create mode 100644 cudax/test/multi_gpu/algorithms/CMakeLists.txt
 create mode 100644 cudax/test/multi_gpu/algorithms/reduce/CMakeLists.txt
 create mode 100644 cudax/test/multi_gpu/algorithms/reduce/basic.cu

diff --git a/cudax/include/cuda/experimental/__multi_gpu/algorithm/common.h b/cudax/include/cuda/experimental/__multi_gpu/algorithm/common.h
new file mode 100644
index 00000000000..b85367cbd04
--- /dev/null
+++ b/cudax/include/cuda/experimental/__multi_gpu/algorithm/common.h
@@ -0,0 +1,155 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_EXPERIMENTAL___MULTI_GPU_ALGORITHM_COMMON_H
+#define _CUDA_EXPERIMENTAL___MULTI_GPU_ALGORITHM_COMMON_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <thrust/system/cuda/detail/dispatch.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <cuda/__container/buffer.h>
+#include <cuda/__device/device_ref.h>
+#include <cuda/__functional/lazy_call_or.h>
+#include <cuda/__memory_pool/device_memory_pool.h>
+#include <cuda/__memory_resource/get_memory_resource.h>
+#include <cuda/__runtime/ensure_current_context.h>
+#include <cuda/__stream/get_stream.h>
+#include <cuda/__stream/stream_ref.h>
+#include <cuda/__utility/no_init.h>
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/__iterator/concepts.h>
+#include <cuda/std/__iterator/readable_traits.h>
+#include <cuda/std/__ranges/concepts.h>
+#include <cuda/std/__type_traits/is_trivially_constructible.h>
+#include <cuda/std/__type_traits/remove_cvref.h>
+#include <cuda/std/__utility/declval.h>
+#include <cuda/std/__utility/forward.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+// NOLINTBEGIN(bugprone-reserved-identifier)
+
+namespace cuda::experimental::__detail
+{
+#define __CUDAX_MULTI_GPU_DISPATCH(__logical_device, __count, __call, __arguments)                       \
+  do                                                                                                     \
+  {                                                                                                      \
+    const auto __cur_context = ::cuda::__ensure_current_context{(__logical_device).context()};           \
+    auto __status            = ::cudaError_t{};                                                          \
+                                                                                                         \
+    THRUST_INDEX_TYPE_DISPATCH(__status, __call, __count, __arguments);                                  \
+    THRUST_NS_QUALIFIER::cuda_cub::throw_on_error(__status, /*msg=*/"performing " #__call #__arguments); \
+  } while (0)
+
+#define __CUDAX_MULTI_GPU_DOUBLE_DISPATCH(__logical_device, __count1, __count2, __call, __arguments)     \
+  do                                                                                                     \
+  {                                                                                                      \
+    const auto __cur_context = ::cuda::__ensure_current_context{(__logical_device).context()};           \
+    auto __status            = ::cudaError_t{};                                                          \
+                                                                                                         \
+    THRUST_DOUBLE_INDEX_TYPE_DISPATCH(__status, __call, __count1, __count2, __arguments);                \
+    THRUST_NS_QUALIFIER::cuda_cub::throw_on_error(__status, /*msg=*/"performing " #__call #__arguments); \
+  } while (0)
+
+template <class _Env>
+[[nodiscard]] _CCCL_HOST_API constexpr ::cuda::stream_ref __stream_from_env(const _Env& __env)
+{
+  return ::cuda::__lazy_call_or(
+    ::cuda::get_stream,
+    [] {
+      return ::cuda::stream_ref{::CUstream{}};
+    },
+    __env);
+}
+
+template <class _Env>
+[[nodiscard]] _CCCL_HOST_API constexpr decltype(auto) __resource_from_env(const _Env& __env, ::cuda::device_ref __device)
+{
+  return ::cuda::__lazy_call_or(
+    ::cuda::mr::get_memory_resource,
+    [&] {
+      return ::cuda::device_default_memory_pool(__device);
+    },
+    __env);
+}
+
+template <typename _Env>
+[[nodiscard]] _CCCL_HOST_API constexpr decltype(auto) __sanitize_buffer_env(const _Env& __env)
+{
+  if constexpr (::cuda::__buffer_compatible_env<_Env>)
+  {
+    return __env;
+  }
+  else
+  {
+    return ::cuda::std::execution::env<>{};
+  }
+}
+
+template <class _Tp, class _Resource, class _Env>
+[[nodiscard]] _CCCL_HOST_API constexpr auto __make_safe_uninitialized_buffer(
+  ::cuda::stream_ref __stream, _Resource&& __resource, ::cuda::std::size_t __size, const _Env& __env)
+{
+  if constexpr (::cuda::std::is_trivially_constructible_v<_Tp>)
+  {
+    return ::cuda::make_buffer<_Tp>(
+      __stream, ::cuda::std::forward<_Resource>(__resource), __size, ::cuda::no_init, __sanitize_buffer_env(__env));
+  }
+  else
+  {
+    return ::cuda::make_buffer<_Tp>(
+      __stream, ::cuda::std::forward<_Resource>(__resource), __size, _Tp{}, __sanitize_buffer_env(__env));
+  }
+}
+
+template <class _InputRangeOfRanges, class _RangeOfOutputIt, class _EnvRange>
+struct __in_range_out_it_properties
+{
+  using __input_type  = ::cuda::std::ranges::range_value_t<::cuda::std::ranges::range_reference_t<_InputRangeOfRanges>>;
+  using __output_type = ::cuda::std::iter_value_t<::cuda::std::ranges::range_reference_t<_RangeOfOutputIt>>;
+
+  using __env_type = ::cuda::std::ranges::range_value_t<_EnvRange>;
+
+  using __resource_type = ::cuda::std::remove_cvref_t<decltype(::cuda::experimental::__detail::__resource_from_env(
+    ::cuda::std::declval<__env_type>(), ::cuda::std::declval<::cuda::device_ref>()))>;
+
+  using __buffer_type = ::cuda::__buffer_type_for_props<__output_type, typename __resource_type::default_queries>;
+};
+
+template <class _RangeOfRanges>
+_CCCL_CONCEPT __range_of_sized_random_access_ranges = _CCCL_REQUIRES_EXPR((_RangeOfRanges), )(
+  requires(::cuda::std::ranges::forward_range<_RangeOfRanges>),
+  requires(::cuda::std::ranges::sized_range<_RangeOfRanges>),
+  requires(::cuda::std::ranges::random_access_range<::cuda::std::ranges::range_reference_t<_RangeOfRanges>>));
+
+template <class _RangeOfIters, class _Tp>
+_CCCL_CONCEPT __range_of_output_iters = _CCCL_REQUIRES_EXPR((_RangeOfIters, _Tp), )(
+  requires(::cuda::std::ranges::forward_range<_RangeOfIters>),
+  requires(
+    ::cuda::std::output_iterator<::cuda::std::remove_cvref_t<::cuda::std::ranges::range_reference_t<_RangeOfIters>>,
+                                 _Tp>));
+} // namespace cuda::experimental::__detail
+
+// NOLINTEND(bugprone-reserved-identifier)
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_EXPERIMENTAL___MULTI_GPU_ALGORITHM_COMMON_H
diff --git a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h
new file mode 100644
index 00000000000..778414b6af4
--- /dev/null
+++ b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h
@@ -0,0 +1,349 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_EXPERIMENTAL___MULTI_GPU_REDUCE_H
+#define _CUDA_EXPERIMENTAL___MULTI_GPU_REDUCE_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/dispatch/kernels/kernel_reduce.cuh>
+
+#include <cuda/__memory_resource/resource.h>
+#include <cuda/__runtime/ensure_current_context.h>
+#include <cuda/__stream/get_stream.h>
+#include <cuda/__stream/stream_ref.h>
+#include <cuda/std/__concepts/same_as.h>
+#include <cuda/std/__iterator/readable_traits.h>
+#include <cuda/std/__memory/pointer_traits.h>
+#include <cuda/std/__ranges/concepts.h>
+#include <cuda/std/__ranges/size.h>
+#include <cuda/std/__ranges/zip_view.h>
+#include <cuda/std/__utility/move.h>
+#include <cuda/std/span>
+
+#include <cuda/experimental/__multi_gpu/algorithm/common.h>
+#include <cuda/experimental/__multi_gpu/concepts.h>
+
+#include <vector>
+
+#include <cuda/std/__cccl/prologue.h>
+
+// NOLINTBEGIN(bugprone-reserved-identifier)
+
+namespace cuda::experimental
+{
+namespace __detail::__reduce
+{
+template <class _Buffer, class _Env>
+struct __partial_redop
+{
+  using __buffer_type = _Buffer;
+  using __env_type    = _Env;
+
+  _Buffer __buffer;
+  _Env __env;
+  ::cuda::stream_ref __stream;
+};
+
+template <class _Buffer, class _Comm, class _Env, class _InputRange, class _Tp, class _BinaryOp>
+[[nodiscard]] _CCCL_HOST_API __partial_redop<_Buffer, _Env> __local_reduction(
+  const ::cuda::std::int32_t __ROOT_RANK,
+  _Comm&& __comm,
+  _Env __env,
+  _InputRange&& __inputs,
+  const _Tp& __init,
+  _BinaryOp __op)
+{
+  const auto& __logical_device = __comm.logical_device();
+  auto __stream                = __stream_from_env(__env);
+  auto __resource              = __resource_from_env(__env, __logical_device.underlying_device());
+
+  static_assert(::cuda::mr::resource_with<decltype(__resource), ::cuda::mr::device_accessible>,
+                "Provided memory resource must be device accessible");
+
+  const auto __num_items = ::cuda::std::ranges::size(__inputs);
+  // Allocate enough storage so that we can use the buffer directly in an in-place comm all
+  // gather/all reduce call. Those calls require that the receive buffer is of size nranks *
+  // sendcount.
+  auto __buff = __make_safe_uninitialized_buffer<_Tp>(__stream, __resource, __comm.size(), __env);
+  static_assert(::cuda::std::same_as<decltype(__buff), _Buffer>);
+
+  if (const auto __rank = __comm.rank(); __rank == __ROOT_RANK)
+  {
+    __CUDAX_MULTI_GPU_DISPATCH(
+      __logical_device,
+      __num_items,
+      CUB_NS_QUALIFIER::DeviceReduce::Reduce,
+      (::cuda::std::ranges::begin(__inputs),
+       // Similarly to above, prepare for the comm calls later. In order for those to be
+       // in-place, the sendbuff = recvbuff + rank, so we need to place our partial result
+       // there
+       __buff.begin() + __rank,
+       __num_items_fixed,
+       ::cuda::std::move(__op),
+       __init,
+       __env));
+  }
+  else
+  {
+    __CUDAX_MULTI_GPU_DISPATCH(
+      __logical_device,
+      __num_items,
+      CUB_NS_QUALIFIER::DeviceReduce::Reduce,
+      (::cuda::std::ranges::begin(__inputs),
+       // Similarly to above, prepare for the com calls later. In order for those to be
+       // in-place, the sendbuff = recvbuff + rank, so we need to place our partial result
+       // there
+       __buff.begin() + __rank,
+       __num_items_fixed,
+       ::cuda::std::move(__op),
+       CUB_NS_QUALIFIER::detail::reduce::no_init,
+       __env));
+  }
+  return {::cuda::std::move(__buff), ::cuda::std::move(__env), __stream};
+}
+
+template <class _CommRange, class _PartialType, class _RangeOfOutputIt, class _BinaryOp>
+void __direct_reduction(_CommRange&& __comms,
+                        const ::std::vector<_PartialType>& __partials,
+                        _RangeOfOutputIt&& __outputs,
+                        const _BinaryOp& __op)
+{
+  auto&& __guard = ::cuda::std::ranges::begin(__comms)->group_guard();
+
+  for (const auto [__comm, __local, __out_it] : ::cuda::std::ranges::views::zip(__comms, __partials, __outputs))
+  {
+    __comm.all_reduce(
+      __guard,
+      __local.__buffer.data() + __comm.rank(),
+      ::cuda::std::to_address(__out_it),
+      /*__count=*/1,
+      __op,
+      __local.__stream);
+  }
+}
+
+template <class _CommRange, class _PartialType, class _RangeOfOutputIt, class _BinaryOp>
+void __two_stage_gather_reduction(
+  _CommRange&& __comms,
+  const ::std::vector<_PartialType>& __partials,
+  _RangeOfOutputIt&& __outputs,
+  const _BinaryOp& __op)
+{
+  {
+    auto&& __guard = ::cuda::std::ranges::begin(__comms)->group_guard();
+
+    for (const auto& [__comm, __local] : ::cuda::std::ranges::views::zip(__comms, __partials))
+    {
+      auto* const __ptr = __local.__buffer.data();
+
+      __comm.all_gather(__guard, __ptr + __comm.rank(), __ptr, /*__count=*/1, __local.__stream);
+    }
+  }
+
+  for (auto&& [__comm, __part, __out] : ::cuda::std::ranges::views::zip(__comms, __partials, __outputs))
+  {
+    auto&& [__buffer, __env, _] = __part;
+    const auto __num_items      = __buffer.size();
+
+    __CUDAX_MULTI_GPU_DISPATCH(
+      __comm.logical_device(),
+      __num_items,
+      CUB_NS_QUALIFIER::DeviceReduce::Reduce,
+      (__buffer.begin(), __out, __num_items_fixed, __op, CUB_NS_QUALIFIER::detail::reduce::no_init, __env));
+  }
+}
+} // namespace __detail::__reduce
+
+//! @brief Reduce each input range over its communicator and write one result per output
+//! iterator.
+//!
+//! Performs one reduction per communicator in parallel across devices. The communicators,
+//! environments, input ranges and output iterators are iterated in lockstep, so for the
+//! i-th element of each range the i-th input range is reduced with `__op` seeded by
+//! `__init` on the i-th communicator's devices, the partial results are combined across
+//! all ranks of that communicator, and the final value is written through the i-th output
+//! iterator.
+//!
+//! This routine is used when the current thread or process owns multiple local GPUs. For
+//! example, consider a scenario where there are 8 GPUs and 4 processes such that each
+//! process owns 2 GPUs. Then the user would call this routine on each process, passing in both
+//! local arrays:
+//! @code{.cpp}
+//! device_buffer gpu_0_data = ...;
+//! device_buffer gpu_1_data = ...;
+//!
+//! cudax::reduce({comm0, comm1},
+//!               {env_0, env_1},
+//!               {gpu_0_data, gpu_1_data},
+//!               {out_0, out_1},
+//!               ...)
+//! @endcode
+//!
+//! All ranges must have the same length. The algorithm will cap iteration to the shortest
+//! length, but this should not be relied upon and may change at any time, for any reason. So
+//! differing lengths is effectively undefined behavior.
+//!
+//! After this call returns, all local output iterators will hold the same value. In that sense
+//! this routine is similar to an "all reduce".
+//!
+//! @tparam _CommRange The range of communicators. Each element must model the communicator
+//!         concept.
+//! @tparam _EnvRange The range of execution environments. Each environment supplies the
+//!         stream and memory resource used for its communicator.
+//! @tparam _InputRangeOfRanges The range whose elements are the per-communicator input
+//!         ranges. Each element must be a sized random-access range.
+//! @tparam _RangeOfOutputIt The range of output iterators, one per communicator.
+//! @tparam _Tp The reduction and result value type. Deduced by default from the output
+//!         element type.
+//! @tparam _BinaryOp The binary reduction operator type. Defaults to `::cuda::std::plus<>`.
+//!
+//! @param[in] __comms The range of communicators.
+//! @param[in] __envs The range of execution environments.
+//! @param[in] __range_of_inputs The range of per-communicator input ranges to reduce.
+//! @param[out] __outputs The range of output iterators receiving the per-communicator results.
+//! @param[in] __init The initial value seeding each reduction.
+//! @param[in] __op The binary reduction operator.
+_CCCL_TEMPLATE(class _CommRange,
+               class _EnvRange,
+               class _InputRangeOfRanges,
+               class _RangeOfOutputIt,
+               class _Tp       = ::cuda::std::iter_value_t<::cuda::std::ranges::range_reference_t<_RangeOfOutputIt>>,
+               class _BinaryOp = ::cuda::std::plus<>)
+_CCCL_REQUIRES(__range_of_communicators<_CommRange> _CCCL_AND ::cuda::std::ranges::forward_range<_EnvRange> _CCCL_AND
+                 __detail::__range_of_sized_random_access_ranges<_InputRangeOfRanges> _CCCL_AND
+                   __detail::__range_of_output_iters<_RangeOfOutputIt, _Tp>)
+_CCCL_HOST_API void reduce(
+  _CommRange&& __comms,
+  _EnvRange&& __envs,
+  _InputRangeOfRanges&& __range_of_inputs,
+  _RangeOfOutputIt&& __outputs,
+  _Tp __init     = {},
+  _BinaryOp __op = {})
+{
+  static_assert(::cuda::std::ranges::sized_range<_CommRange>);
+
+  using __properties =
+    ::cuda::experimental::__detail::__in_range_out_it_properties<_InputRangeOfRanges, _RangeOfOutputIt, _EnvRange>;
+  using __partial_type = ::cuda::experimental::__detail::__reduce::__partial_redop<typename __properties::__buffer_type,
+                                                                                   typename __properties::__env_type>;
+
+  const auto __num_local = ::cuda::std::ranges::size(__comms);
+
+  if (!__num_local)
+  {
+    return;
+  }
+
+  auto __partials = ::std::vector<__partial_type>{};
+
+  __partials.reserve(__num_local);
+  // TODO(jfaibussowit): can just be ranges::zip | ranges::transform | ranges::to() (and then
+  // we don't need to do the env, and buffer type deduction upfront)
+  for (auto&& [__comm, __env, __inputs] : ::cuda::std::ranges::views::zip(__comms, __envs, __range_of_inputs))
+  {
+    __partials.emplace_back(
+      ::cuda::experimental::__detail::__reduce::__local_reduction<typename __partial_type::__buffer_type>(
+        /*__ROOT_RANK=*/0, __comm, __env, __inputs, __init, __op));
+  }
+
+  if constexpr (::cuda::experimental::__has_all_reduce<::cuda::std::ranges::range_value_t<_CommRange>,
+                                                       typename __properties::__output_type*>)
+  {
+    __direct_reduction(__comms, __partials, __outputs, __op);
+  }
+  else
+  {
+    __two_stage_gather_reduction(__comms, __partials, __outputs, __op);
+  }
+}
+
+//! @brief Reduce a single input range over a single communicator using the given execution
+//! environment.
+//!
+//! Convenience wrapper that forwards a single `(communicator, environment, input range,
+//! output iterator)` to the range-based overload. The input range is reduced with `__op`
+//! seeded by `__init` across the communicator's ranks and the final value is written
+//! through `__output`.
+//!
+//! @tparam _Comm The communicator type. Must model the communicator concept.
+//! @tparam _Env The execution environment type. Supplies the stream and memory resource.
+//! @tparam _InputRange The input range type. Must be a random-access range.
+//! @tparam _OutputIt The output iterator type.
+//! @tparam _Tp The reduction and result value type. Deduced by default from the output
+//! value type.
+//! @tparam _BinaryOp The binary reduction operator type. Defaults to `::cuda::std::plus<>`.
+//!
+//! @param[in] __comm The communicator.
+//! @param[in] __env The execution environment.
+//! @param[in] __inputs The input range to reduce.
+//! @param[out] __output The output iterator receiving the result.
+//! @param[in] __init The initial value seeding the reduction.
+//! @param[in] __op The binary reduction operator.
+_CCCL_TEMPLATE(class _Comm,
+               class _Env,
+               class _InputRange,
+               class _OutputIt,
+               class _Tp       = ::cuda::std::iter_value_t<_OutputIt>,
+               class _BinaryOp = ::cuda::std::plus<>)
+_CCCL_REQUIRES(__communicator<_Comm> _CCCL_AND ::cuda::std::ranges::random_access_range<_InputRange>
+                 _CCCL_AND ::cuda::std::output_iterator<_OutputIt, _Tp>)
+_CCCL_HOST_API void
+reduce(_Comm&& __comm, _Env&& __env, _InputRange&& __inputs, _OutputIt __output, _Tp __init = {}, _BinaryOp __op = {})
+{
+  reduce(::cuda::std::span<::cuda::std::remove_reference_t<_Comm>, 1>{::cuda::std::addressof(__comm), 1},
+         ::cuda::std::span<::cuda::std::remove_reference_t<_Env>, 1>{::cuda::std::addressof(__env), 1},
+         ::cuda::std::span<::cuda::std::remove_reference_t<_InputRange>, 1>{::cuda::std::addressof(__inputs), 1},
+         ::cuda::std::span<::cuda::std::remove_reference_t<_OutputIt>, 1>{::cuda::std::addressof(__output), 1},
+         ::cuda::std::move(__init),
+         ::cuda::std::move(__op));
+}
+
+//! @brief Reduce a single input range over a single communicator using
+//! `::cuda::std::plus<>` and a default-constructed initial value.
+//!
+//! Convenience wrapper that supplies a default execution environment and a default
+//! `::cuda::std::plus<>` reduction, forwarding to the environment-taking overload. The
+//! result value type is the input range's value type.
+//!
+//! @tparam _Comm The communicator type. Must model the communicator concept.
+//! @tparam _InputRange The input range type. Must be a random-access range.
+//! @tparam _OutputIt The output iterator type.
+//!
+//! @param[in] __comm The communicator.
+//! @param[in] __inputs The input range to reduce.
+//! @param[out] __output The output iterator receiving the result.
+_CCCL_TEMPLATE(class _Comm, class _InputRange, class _OutputIt)
+_CCCL_REQUIRES(__communicator<_Comm> _CCCL_AND ::cuda::std::ranges::random_access_range<_InputRange>
+                 _CCCL_AND ::cuda::std::output_iterator<_OutputIt, ::cuda::std::ranges::range_value_t<_InputRange>>)
+_CCCL_HOST_API void reduce(_Comm&& __comm, _InputRange&& __inputs, _OutputIt __output)
+{
+  reduce(::cuda::std::forward<_Comm>(__comm),
+         ::cuda::std::execution::env<>{},
+         ::cuda::std::forward<_InputRange>(__inputs),
+         ::cuda::std::forward<_OutputIt>(__output));
+}
+} // namespace cuda::experimental
+
+// NOLINTEND(bugprone-reserved-identifier)
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_EXPERIMENTAL___MULTI_GPU_REDUCE_H
diff --git a/cudax/include/cuda/experimental/__multi_gpu/concepts.h b/cudax/include/cuda/experimental/__multi_gpu/concepts.h
index 2d805553aee..5b3a9f20e2f 100644
--- a/cudax/include/cuda/experimental/__multi_gpu/concepts.h
+++ b/cudax/include/cuda/experimental/__multi_gpu/concepts.h
@@ -26,6 +26,8 @@
 #include <cuda/std/__concepts/convertible_to.h>
 #include <cuda/std/__concepts/same_as.h>
 #include <cuda/std/__functional/operations.h>
+#include <cuda/std/__ranges/concepts.h>
+#include <cuda/std/__type_traits/remove_cvref.h>
 #include <cuda/std/__utility/declval.h>
 #include <cuda/std/cstdint>
 
@@ -214,6 +216,18 @@ _CCCL_CONCEPT __has_all_to_all_v = _CCCL_REQUIRES_EXPR(
     __recv_counts,
     __recv_displs,
     __stream));
+
+// ==========================================================================================
+
+template <class _Range>
+_CCCL_CONCEPT __range_of_communicators = _CCCL_REQUIRES_EXPR((_Range), )(
+  requires(::cuda::std::ranges::forward_range<_Range>),
+  requires(__communicator<::cuda::std::remove_cvref_t<::cuda::std::ranges::range_reference_t<_Range>>>));
+
+template <class _Range>
+_CCCL_CONCEPT __range_of_streams = _CCCL_REQUIRES_EXPR((_Range), )(
+  requires(::cuda::std::ranges::forward_range<_Range>),
+  requires(::cuda::std::ranges::__container_compatible_range<_Range, ::cuda::stream_ref>));
 } // namespace cuda::experimental
 // NOLINTEND(bugprone-reserved-identifier)
 
diff --git a/cudax/test/multi_gpu/CMakeLists.txt b/cudax/test/multi_gpu/CMakeLists.txt
index 0bef8b2bc2e..f617ecf5e3b 100644
--- a/cudax/test/multi_gpu/CMakeLists.txt
+++ b/cudax/test/multi_gpu/CMakeLists.txt
@@ -63,6 +63,7 @@ function(cudax_add_multi_gpu_test sub_prefix target_name_var source)
   set(${target_name_var} ${test_target} PARENT_SCOPE)
 endfunction()
 
+add_subdirectory(algorithms)
 add_subdirectory(communicators)
 add_subdirectory(concepts)
 add_subdirectory(nccl)
diff --git a/cudax/test/multi_gpu/algorithms/CMakeLists.txt b/cudax/test/multi_gpu/algorithms/CMakeLists.txt
new file mode 100644
index 00000000000..279f970b1af
--- /dev/null
+++ b/cudax/test/multi_gpu/algorithms/CMakeLists.txt
@@ -0,0 +1,11 @@
+#===----------------------------------------------------------------------===##
+#
+# Part of CUDA Experimental in CUDA C++ Core Libraries,
+# under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+#
+#===----------------------------------------------------------------------===##
+
+add_subdirectory(reduce)
diff --git a/cudax/test/multi_gpu/algorithms/reduce/CMakeLists.txt b/cudax/test/multi_gpu/algorithms/reduce/CMakeLists.txt
new file mode 100644
index 00000000000..1a43a0be4f8
--- /dev/null
+++ b/cudax/test/multi_gpu/algorithms/reduce/CMakeLists.txt
@@ -0,0 +1,20 @@
+#===----------------------------------------------------------------------===##
+#
+# Part of CUDA Experimental in CUDA C++ Core Libraries,
+# under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+#
+#===----------------------------------------------------------------------===##
+
+if (NOT cudax_ENABLE_NCCL)
+  return()
+endif()
+
+file(GLOB test_srcs LIST_DIRECTORIES FALSE CONFIGURE_DEPENDS *.cu *.cpp)
+
+foreach (src IN LISTS test_srcs)
+  cudax_add_multi_gpu_test("algorithms.reduce" test_target "${src}")
+  target_link_libraries(${test_target} PRIVATE NCCL::nccl)
+endforeach()
diff --git a/cudax/test/multi_gpu/algorithms/reduce/basic.cu b/cudax/test/multi_gpu/algorithms/reduce/basic.cu
new file mode 100644
index 00000000000..babffa8ff50
--- /dev/null
+++ b/cudax/test/multi_gpu/algorithms/reduce/basic.cu
@@ -0,0 +1,322 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/buffer>
+#include <cuda/functional>
+#include <cuda/memory_resource>
+#include <cuda/std/cstdint>
+#include <cuda/std/execution>
+#include <cuda/std/functional>
+#include <cuda/std/type_traits>
+
+#include <cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h>
+
+#include <numeric>
+#include <vector>
+
+#include <testing.cuh>
+
+#include "../../communicators/nccl/nccl_test_helpers.cuh"
+
+namespace
+{
+using custom_value = c2h::custom_type_t<c2h::accumulateable_t, c2h::less_comparable_t, c2h::equal_comparable_t>;
+static_assert(cudax::nccl_transportable<custom_value>);
+
+template <typename T>
+T make_value(int i)
+{
+  return static_cast<T>(i);
+}
+
+template <>
+custom_value make_value<>(int i)
+{
+  return {static_cast<cuda::std::size_t>(i), static_cast<cuda::std::size_t>(i)};
+};
+
+// A custom reduction operator equivalent to `cuda::std::plus<>`.
+struct custom_plus
+{
+  template <class T>
+  _CCCL_HOST_DEVICE constexpr T operator()(const T& lhs, const T& rhs) const
+  {
+    return lhs + rhs;
+  }
+};
+
+using value_types = c2h::type_list<cuda::std::int32_t, float, custom_value>;
+using operators   = c2h::type_list<::cuda::std::plus<>, ::cuda::maximum<>, custom_plus>;
+
+// One output iterator per local output buffer. Collected after `out` is fully built so the
+// iterators do not dangle across reallocations.
+template <class OutBuffers>
+[[nodiscard]] auto make_output_iterators(OutBuffers& out)
+{
+  std::vector<typename cuda::std::remove_cvref_t<decltype(out.front())>::iterator> outputs;
+
+  outputs.reserve(out.size());
+  for (auto& buf : out)
+  {
+    outputs.push_back(buf.begin());
+  }
+  return outputs;
+}
+
+// Run the full reduction, wait for it to finish, and check that `reduce` left its argument ranges
+// untouched. This boilerplate is identical for every test regardless of how the inputs are shaped.
+template <class Comms, class Envs, class In, class Outputs, class T, class Op, class Streams>
+void do_reduce(Comms& comms, Envs& envs, In& in, Outputs& outputs, const T& init, Op op, Streams& streams)
+{
+  using value_type = typename cuda::std::remove_cvref_t<decltype(in.front())>::value_type;
+
+  // cuda::std::execution::env has no operator==, so we can only compare the sizes.
+  const auto envs_size = envs.size();
+
+  auto pool = cuda::mr::legacy_pinned_memory_resource{};
+  std::vector<cuda::host_buffer<value_type>> in_copy;
+  in_copy.reserve(in.size());
+  for (const auto& buf : in)
+  {
+    in_copy.emplace_back(cuda::make_buffer(buf.stream(), pool, buf));
+  }
+
+  const auto outputs_copy = outputs;
+
+  cudax::reduce(comms, envs, in, outputs, init, op);
+
+  for (auto& stream : streams)
+  {
+    stream.sync();
+  }
+
+  REQUIRE(envs.size() == envs_size);
+  for (cuda::std::size_t i = 0; i < in.size(); ++i)
+  {
+    const auto actual = cuda::make_buffer(in[i].stream(), pool, in[i]);
+    REQUIRE_THAT(actual, Equals(in_copy[i]));
+  }
+  REQUIRE(outputs == outputs_copy);
+}
+} // namespace
+
+MGMN_TEST("reduce, one element per rank", value_types, operators)
+{
+  using T  = c2h::get<0, TestType>;
+  using Op = c2h::get<1, TestType>;
+
+  // Seed each reduction with a few hardcoded initializers. The init participates in the fold the
+  // same way on host and device, so any value works for every operator under test.
+  const T init = make_value<T>(GENERATE(0, 1, -1, 5));
+
+  auto comms   = this->communicators();
+  auto streams = nccl_test_util::make_streams();
+
+  // Global rank `comms[i].rank()` contributes the single value `rank`. Each local rank also gets a
+  // one-element output buffer and an environment carrying its stream, so the reduction is
+  // stream-ordered on the correct device. `reference` mirrors the contributions of every global
+  // rank so we can fold them on the host exactly like `reduce` does on the device.
+  std::vector<cuda::device_buffer<T>> in;
+  std::vector<cuda::device_buffer<T>> out;
+  std::vector<decltype(::cuda::std::execution::env{::cuda::stream_ref{streams[0]}})> envs;
+  std::vector<T> reference;
+
+  in.reserve(comms.size());
+  out.reserve(comms.size());
+  envs.reserve(comms.size());
+  for (int i = 0; i < comms.size(); ++i)
+  {
+    const auto values = {make_value<T>(comms[i].rank())};
+    in.emplace_back(cuda::make_device_buffer<T>(streams[i], comms[i].logical_device().underlying_device(), values));
+    out.emplace_back(
+      cuda::make_device_buffer<T>(streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init));
+    envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}});
+  }
+  for (int r = 0; r < comms.front().size(); ++r)
+  {
+    reference.push_back(make_value<T>(r));
+  }
+
+  auto outputs = make_output_iterators(out);
+
+  do_reduce(comms, envs, in, outputs, init, Op{}, streams);
+
+  const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{});
+
+  for (const auto& buf : out)
+  {
+    auto pool         = cuda::mr::legacy_pinned_memory_resource{};
+    const auto actual = cuda::make_buffer(buf.stream(), pool, buf);
+
+    REQUIRE(actual.size() == 1);
+    REQUIRE(actual.front() == expected);
+  }
+}
+
+MGMN_TEST("reduce, multiple elements per rank", value_types, operators)
+{
+  using T  = c2h::get<0, TestType>;
+  using Op = c2h::get<1, TestType>;
+
+  // Seed each reduction with a few hardcoded initializers. The init participates in the fold the
+  // same way on host and device, so any value works for every operator under test.
+  const T init = make_value<T>(GENERATE(0, 1, -1, 5));
+
+  auto comms   = this->communicators();
+  auto streams = nccl_test_util::make_streams();
+
+  // Global rank `comms[i].rank()` contributes `{rank, rank, rank}`. `reduce` first does a local CUB
+  // reduction of each rank's range, then combines the partials across ranks. Each local rank also
+  // gets a one-element output buffer and an environment carrying its stream. `reference` mirrors
+  // every global rank's three contributions for the host-side fold.
+  std::vector<cuda::device_buffer<T>> in;
+  std::vector<cuda::device_buffer<T>> out;
+  std::vector<decltype(::cuda::std::execution::env{::cuda::stream_ref{streams[0]}})> envs;
+  std::vector<T> reference;
+
+  in.reserve(comms.size());
+  out.reserve(comms.size());
+  envs.reserve(comms.size());
+  for (int i = 0; i < comms.size(); ++i)
+  {
+    const auto v      = make_value<T>(comms[i].rank());
+    const auto values = {v, v, v};
+    in.emplace_back(cuda::make_device_buffer<T>(streams[i], comms[i].logical_device().underlying_device(), values));
+    out.emplace_back(
+      cuda::make_device_buffer<T>(streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init));
+    envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}});
+  }
+  for (int r = 0; r < comms.front().size(); ++r)
+  {
+    const auto v = make_value<T>(r);
+    reference.insert(reference.end(), {v, v, v});
+  }
+
+  auto outputs = make_output_iterators(out);
+
+  do_reduce(comms, envs, in, outputs, init, Op{}, streams);
+
+  const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{});
+
+  for (const auto& buf : out)
+  {
+    auto pool         = cuda::mr::legacy_pinned_memory_resource{};
+    const auto actual = cuda::make_buffer(buf.stream(), pool, buf);
+
+    REQUIRE(actual.size() == 1);
+    REQUIRE(actual.front() == expected);
+  }
+}
+
+MGMN_TEST("reduce, some ranks empty", value_types, operators)
+{
+  using T  = c2h::get<0, TestType>;
+  using Op = c2h::get<1, TestType>;
+
+  const T init = make_value<T>(GENERATE(0, 1, -1, 5));
+
+  auto comms   = this->communicators();
+  auto streams = nccl_test_util::make_streams();
+
+  // Even global ranks contribute two copies of `rank`; odd global ranks contribute an empty input
+  // range. Rank 0 (the reduction root) is always non-empty. `reduce` must treat an empty rank as
+  // contributing nothing, exactly like `std::accumulate` over the surviving elements. `reference`
+  // mirrors that for the host-side fold.
+  std::vector<cuda::device_buffer<T>> in;
+  std::vector<cuda::device_buffer<T>> out;
+  std::vector<decltype(::cuda::std::execution::env{::cuda::stream_ref{streams[0]}})> envs;
+  std::vector<T> reference;
+
+  in.reserve(comms.size());
+  out.reserve(comms.size());
+  envs.reserve(comms.size());
+  for (int i = 0; i < comms.size(); ++i)
+  {
+    const auto rank = comms[i].rank();
+    if (rank % 2 == 0)
+    {
+      const auto values = {make_value<T>(rank), make_value<T>(rank)};
+      in.emplace_back(cuda::make_device_buffer<T>(streams[i], comms[i].logical_device().underlying_device(), values));
+    }
+    else
+    {
+      in.emplace_back(cuda::make_device_buffer<T>(streams[i], comms[i].logical_device().underlying_device()));
+    }
+    out.emplace_back(
+      cuda::make_device_buffer<T>(streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init));
+    envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}});
+  }
+
+  for (int r = 0; r < comms.front().size(); r += 2)
+  {
+    reference.push_back(make_value<T>(r));
+    reference.push_back(make_value<T>(r));
+  }
+
+  auto outputs = make_output_iterators(out);
+
+  do_reduce(comms, envs, in, outputs, init, Op{}, streams);
+
+  const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{});
+
+  for (const auto& buf : out)
+  {
+    auto pool         = cuda::mr::legacy_pinned_memory_resource{};
+    const auto actual = cuda::make_buffer(buf.stream(), pool, buf);
+
+    REQUIRE(actual.size() == 1);
+    REQUIRE(actual.front() == expected);
+  }
+}
+
+MGMN_TEST("reduce, all ranks empty", value_types, operators)
+{
+  using T  = c2h::get<0, TestType>;
+  using Op = c2h::get<1, TestType>;
+
+  const T init = make_value<T>(GENERATE(0, 1, -1, 5));
+
+  auto comms   = this->communicators();
+  auto streams = nccl_test_util::make_streams();
+
+  // No rank contributes any element. Reducing nothing seeded by `init` is just `init`, so every
+  // output must equal `init` regardless of the operator.
+  std::vector<cuda::device_buffer<T>> in;
+  std::vector<cuda::device_buffer<T>> out;
+  std::vector<decltype(::cuda::std::execution::env{::cuda::stream_ref{streams[0]}})> envs;
+
+  in.reserve(comms.size());
+  out.reserve(comms.size());
+  envs.reserve(comms.size());
+  for (int i = 0; i < comms.size(); ++i)
+  {
+    in.emplace_back(cuda::make_device_buffer<T>(streams[i], comms[i].logical_device().underlying_device()));
+    out.emplace_back(
+      cuda::make_device_buffer<T>(streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init));
+    envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}});
+  }
+
+  auto outputs = make_output_iterators(out);
+
+  do_reduce(comms, envs, in, outputs, init, Op{}, streams);
+
+  // Reducing nothing seeded by `init` yields `init`, exactly like `std::accumulate` over an empty
+  // range.
+  const T expected = init;
+
+  for (const auto& buf : out)
+  {
+    auto pool         = cuda::mr::legacy_pinned_memory_resource{};
+    const auto actual = cuda::make_buffer(buf.stream(), pool, buf);
+
+    REQUIRE(actual.size() == 1);
+    REQUIRE(actual.front() == expected);
+  }
+}
diff --git a/cudax/test/multi_gpu/communicators/nccl/basic.cu b/cudax/test/multi_gpu/communicators/nccl/basic.cu
index 1ba084b8adf..f21f209e18b 100644
--- a/cudax/test/multi_gpu/communicators/nccl/basic.cu
+++ b/cudax/test/multi_gpu/communicators/nccl/basic.cu
@@ -32,7 +32,7 @@ C2H_TEST("nccl_communicator_ref not constructible from NCCL_COMM_NULL", "[multi_
   STATIC_REQUIRE(!::cuda::std::is_constructible_v<cudax::nccl_communicator_ref, cuda::std::nullptr_t>);
 }
 
-NCCL_COMM_TEST("nccl_communicator_ref basic")
+MGMN_TEST("nccl_communicator_ref basic", )
 {
   SECTION("rank and size")
   {
diff --git a/cudax/test/multi_gpu/communicators/nccl/collective.cu b/cudax/test/multi_gpu/communicators/nccl/collective.cu
index 92a1402f721..96f73edfa21 100644
--- a/cudax/test/multi_gpu/communicators/nccl/collective.cu
+++ b/cudax/test/multi_gpu/communicators/nccl/collective.cu
@@ -26,7 +26,7 @@ namespace
 constexpr cuda::std::int32_t ROOT_RANK = 0;
 } // namespace
 
-NCCL_COMM_TEST("nccl_communicator_ref all_reduce sum")
+MGMN_TEST("nccl_communicator_ref all_reduce sum", )
 {
   auto streams = nccl_test_util::make_streams();
 
@@ -69,7 +69,7 @@ NCCL_COMM_TEST("nccl_communicator_ref all_reduce sum")
   }
 }
 
-NCCL_COMM_TEST("nccl_communicator_ref all_reduce maximum")
+MGMN_TEST("nccl_communicator_ref all_reduce maximum", )
 {
   auto streams = nccl_test_util::make_streams();
 
@@ -112,7 +112,7 @@ NCCL_COMM_TEST("nccl_communicator_ref all_reduce maximum")
   }
 }
 
-NCCL_COMM_TEST("nccl_communicator_ref reduce sum to root 0")
+MGMN_TEST("nccl_communicator_ref reduce sum to root 0", )
 {
   auto streams = nccl_test_util::make_streams();
 
@@ -150,7 +150,7 @@ NCCL_COMM_TEST("nccl_communicator_ref reduce sum to root 0")
   REQUIRE_THAT(actual, Equals(expected));
 }
 
-NCCL_COMM_TEST("nccl_communicator_ref broadcast from root 0")
+MGMN_TEST("nccl_communicator_ref broadcast from root 0", )
 {
   auto streams = nccl_test_util::make_streams();
 
@@ -192,7 +192,7 @@ NCCL_COMM_TEST("nccl_communicator_ref broadcast from root 0")
   }
 }
 
-NCCL_COMM_TEST("nccl_communicator_ref all_gather")
+MGMN_TEST("nccl_communicator_ref all_gather", )
 {
   auto streams = nccl_test_util::make_streams();
 
@@ -241,7 +241,7 @@ NCCL_COMM_TEST("nccl_communicator_ref all_gather")
   }
 }
 
-NCCL_COMM_TEST("nccl_communicator_ref gather_v to root 0")
+MGMN_TEST("nccl_communicator_ref gather_v to root 0", )
 {
   auto streams = nccl_test_util::make_streams();
 
@@ -308,7 +308,7 @@ NCCL_COMM_TEST("nccl_communicator_ref gather_v to root 0")
   REQUIRE_THAT(actual, Equals(expected));
 }
 
-NCCL_COMM_TEST("nccl_communicator_ref all_to_all_v")
+MGMN_TEST("nccl_communicator_ref all_to_all_v", )
 {
   auto streams = nccl_test_util::make_streams();
 
@@ -379,7 +379,7 @@ NCCL_COMM_TEST("nccl_communicator_ref all_to_all_v")
 
 #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 0)
 
-NCCL_COMM_TEST("nccl_communicator_ref gather to root 0")
+MGMN_TEST("nccl_communicator_ref gather to root", )
 {
   auto streams = nccl_test_util::make_streams();
 
@@ -424,7 +424,7 @@ NCCL_COMM_TEST("nccl_communicator_ref gather to root 0")
   REQUIRE_THAT(actual, Equals(expected));
 }
 
-NCCL_COMM_TEST("nccl_communicator_ref all_to_all")
+MGMN_TEST("nccl_communicator_ref all_to_all", )
 {
   auto streams = nccl_test_util::make_streams();
 
diff --git a/cudax/test/multi_gpu/communicators/nccl/nccl_test_helpers.cuh b/cudax/test/multi_gpu/communicators/nccl/nccl_test_helpers.cuh
index fc5d1298b30..456fcf6120a 100644
--- a/cudax/test/multi_gpu/communicators/nccl/nccl_test_helpers.cuh
+++ b/cudax/test/multi_gpu/communicators/nccl/nccl_test_helpers.cuh
@@ -99,7 +99,7 @@ private:
   std::vector<cudax::nccl_communicator_ref> wrappers_{};
 };
 
-#define NCCL_COMM_TEST(NAME, ...) \
+#define MGMN_TEST(NAME, ...) \
   C2H_TEST_WITH_FIXTURE(::nccl_test_util::nccl_comm_fixture, NAME, "[multi_gpu][nccl]", __VA_ARGS__)
 } // namespace nccl_test_util
 
diff --git a/cudax/test/multi_gpu/communicators/nccl/peer_to_peer.cu b/cudax/test/multi_gpu/communicators/nccl/peer_to_peer.cu
index 0f6f2202ed3..c96989c60f4 100644
--- a/cudax/test/multi_gpu/communicators/nccl/peer_to_peer.cu
+++ b/cudax/test/multi_gpu/communicators/nccl/peer_to_peer.cu
@@ -28,7 +28,7 @@ struct payload
 } // namespace
 
 // Ring exchange via send/recv. Rank r contributes {r, r, r}.
-NCCL_COMM_TEST("nccl_communicator_ref send/recv ring")
+MGMN_TEST("nccl_communicator_ref send/recv ring", )
 {
   if (cuda::devices.size() == 1)
   {
@@ -84,7 +84,7 @@ NCCL_COMM_TEST("nccl_communicator_ref send/recv ring")
   }
 }
 
-NCCL_COMM_TEST("nccl_communicator_ref send/recv transports trivially copyable payload")
+MGMN_TEST("nccl_communicator_ref send/recv transports trivially copyable payload", )
 {
   if (cuda::devices.size() == 1)
   {
diff --git a/libcudacxx/include/cuda/__container/buffer.h b/libcudacxx/include/cuda/__container/buffer.h
index e77f094a7de..059d8f1b44a 100644
--- a/libcudacxx/include/cuda/__container/buffer.h
+++ b/libcudacxx/include/cuda/__container/buffer.h
@@ -51,6 +51,7 @@
 #  include <cuda/std/__ranges/size.h>
 #  include <cuda/std/__ranges/unwrap_end.h>
 #  include <cuda/std/__type_traits/decay.h>
+#  include <cuda/std/__type_traits/remove_cvref.h>
 #  include <cuda/std/__utility/forward.h>
 #  include <cuda/std/__utility/move.h>
 #  include <cuda/std/cstdint>
@@ -62,9 +63,19 @@
 //! @brief The \c buffer class provides a container of contiguous memory
 _CCCL_BEGIN_NAMESPACE_CUDA
 
+template <class _Tp>
+inline constexpr bool __is_any_env = false;
+
+template <class... _Tp>
+inline constexpr bool __is_any_env<::cuda::std::execution::env<_Tp...>> = true;
+
+// TODO:
+//
+// Find a way to enable checks again without bricking for environments that clearly (which we
+// can somehow detect) dont intend to have alignment.
 template <class _Env>
 inline constexpr bool __buffer_compatible_env =
-  ::cuda::std::is_same_v<::cuda::std::decay_t<_Env>, ::cuda::std::execution::env<>>
+  __is_any_env<::cuda::std::remove_cvref_t<_Env>>
   || ::cuda::std::execution::__queryable_with<const _Env&, allocation_alignment_t>;
 
 _CCCL_BEGIN_NAMESPACE_ABI_VER4_BUMP

From b1ece662a8ba0f1bf18bc59423429edbf222a9fb Mon Sep 17 00:00:00 2001
From: Jacob Faibussowitsch <jacob.fai@gmail.com>
Date: Wed, 1 Jul 2026 14:17:44 -0400
Subject: [PATCH 5/7] fixup! Add multi-GPU reduce

---
 .../__multi_gpu/algorithm/reduce/reduce.h     |  15 +-
 .../algorithms/reduce/range_defaults.cu       | 140 ++++++++++++++++++
 2 files changed, 151 insertions(+), 4 deletions(-)
 create mode 100644 cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu

diff --git a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h
index 778414b6af4..926551546b3 100644
--- a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h
+++ b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h
@@ -330,15 +330,22 @@ reduce(_Comm&& __comm, _Env&& __env, _InputRange&& __inputs, _OutputIt __output,
 //! @param[in] __comm The communicator.
 //! @param[in] __inputs The input range to reduce.
 //! @param[out] __output The output iterator receiving the result.
-_CCCL_TEMPLATE(class _Comm, class _InputRange, class _OutputIt)
+_CCCL_TEMPLATE(class _Comm,
+               class _InputRange,
+               class _OutputIt,
+               class _Tp       = ::cuda::std::iter_value_t<_OutputIt>,
+               class _BinaryOp = ::cuda::std::plus<>)
 _CCCL_REQUIRES(__communicator<_Comm> _CCCL_AND ::cuda::std::ranges::random_access_range<_InputRange>
-                 _CCCL_AND ::cuda::std::output_iterator<_OutputIt, ::cuda::std::ranges::range_value_t<_InputRange>>)
-_CCCL_HOST_API void reduce(_Comm&& __comm, _InputRange&& __inputs, _OutputIt __output)
+                 _CCCL_AND ::cuda::std::output_iterator<_OutputIt, _Tp>)
+_CCCL_HOST_API void
+reduce(_Comm&& __comm, _InputRange&& __inputs, _OutputIt __output, _Tp __init = {}, _BinaryOp __op = {})
 {
   reduce(::cuda::std::forward<_Comm>(__comm),
          ::cuda::std::execution::env<>{},
          ::cuda::std::forward<_InputRange>(__inputs),
-         ::cuda::std::forward<_OutputIt>(__output));
+         ::cuda::std::forward<_OutputIt>(__output),
+         ::cuda::std::move(__init),
+         ::cuda::std::move(__op));
 }
 } // namespace cuda::experimental
 
diff --git a/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu b/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu
new file mode 100644
index 00000000000..e37ffffbaf2
--- /dev/null
+++ b/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu
@@ -0,0 +1,140 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/buffer>
+#include <cuda/memory_resource>
+#include <cuda/std/cstdint>
+#include <cuda/std/execution>
+#include <cuda/std/type_traits>
+
+#include <cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h>
+
+#include <numeric>
+#include <vector>
+
+#include <testing.cuh>
+
+#include "../../communicators/nccl/nccl_test_helpers.cuh"
+
+namespace
+{
+using value_type = cuda::std::int32_t;
+
+template <class OutBuffers>
+[[nodiscard]] auto make_output_iterators(OutBuffers& out)
+{
+  std::vector<typename cuda::std::remove_cvref_t<decltype(out.front())>::iterator> outputs;
+
+  outputs.reserve(out.size());
+  for (auto& buf : out)
+  {
+    outputs.push_back(buf.begin());
+  }
+  return outputs;
+}
+} // namespace
+
+MGMN_TEST("reduce, range overload with default initializer and default plus", )
+{
+  auto comms   = this->communicators();
+  auto streams = nccl_test_util::make_streams();
+
+  std::vector<cuda::device_buffer<value_type>> in;
+  std::vector<cuda::device_buffer<value_type>> out;
+  std::vector<decltype(::cuda::std::execution::env{::cuda::stream_ref{streams[0]}})> envs;
+  std::vector<value_type> reference;
+
+  in.reserve(comms.size());
+  out.reserve(comms.size());
+  envs.reserve(comms.size());
+  for (int i = 0; i < comms.size(); ++i)
+  {
+    const auto values = {static_cast<value_type>(comms[i].rank())};
+    in.emplace_back(
+      cuda::make_device_buffer<value_type>(streams[i], comms[i].logical_device().underlying_device(), values));
+    out.emplace_back(cuda::make_device_buffer<value_type>(
+      streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init));
+    envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}});
+  }
+  for (int r = 0; r < comms.front().size(); ++r)
+  {
+    reference.push_back(static_cast<value_type>(r));
+  }
+
+  auto outputs = make_output_iterators(out);
+
+  cudax::reduce(comms, envs, in, outputs);
+
+  for (auto& stream : streams)
+  {
+    stream.sync();
+  }
+
+  const value_type expected = std::accumulate(reference.begin(), reference.end(), value_type{});
+
+  for (const auto& buf : out)
+  {
+    auto pool         = cuda::mr::legacy_pinned_memory_resource{};
+    const auto actual = cuda::make_buffer(buf.stream(), pool, buf);
+
+    REQUIRE(actual.size() == 1);
+    REQUIRE(actual.front() == expected);
+  }
+}
+
+MGMN_TEST("reduce, range overload with explicit initializer and default plus", )
+{
+  const value_type init{10};
+
+  auto comms   = this->communicators();
+  auto streams = nccl_test_util::make_streams();
+
+  std::vector<cuda::device_buffer<value_type>> in;
+  std::vector<cuda::device_buffer<value_type>> out;
+  std::vector<decltype(::cuda::std::execution::env{::cuda::stream_ref{streams[0]}})> envs;
+  std::vector<value_type> reference;
+
+  in.reserve(comms.size());
+  out.reserve(comms.size());
+  envs.reserve(comms.size());
+  for (int i = 0; i < comms.size(); ++i)
+  {
+    const auto values = {static_cast<value_type>(comms[i].rank())};
+    in.emplace_back(
+      cuda::make_device_buffer<value_type>(streams[i], comms[i].logical_device().underlying_device(), values));
+    out.emplace_back(cuda::make_device_buffer<value_type>(
+      streams[i], comms[i].logical_device().underlying_device(), 1, cuda::no_init));
+    envs.emplace_back(::cuda::std::execution::env{::cuda::stream_ref{streams[i]}});
+  }
+  for (int r = 0; r < comms.front().size(); ++r)
+  {
+    reference.push_back(static_cast<value_type>(r));
+  }
+
+  auto outputs = make_output_iterators(out);
+
+  cudax::reduce(comms, envs, in, outputs, init);
+
+  for (auto& stream : streams)
+  {
+    stream.sync();
+  }
+
+  const value_type expected = std::accumulate(reference.begin(), reference.end(), init);
+
+  for (const auto& buf : out)
+  {
+    auto pool         = cuda::mr::legacy_pinned_memory_resource{};
+    const auto actual = cuda::make_buffer(buf.stream(), pool, buf);
+
+    REQUIRE(actual.size() == 1);
+    REQUIRE(actual.front() == expected);
+  }
+}

From f88f027ed1dcf1947e446a75ba65feaf3d52c6be Mon Sep 17 00:00:00 2001
From: Jacob Faibussowitsch <jacob.fai@gmail.com>
Date: Thu, 2 Jul 2026 10:12:03 -0400
Subject: [PATCH 6/7] fixup! Add multi-GPU reduce

---
 .../__multi_gpu/algorithm/reduce/reduce.h     | 24 ++++++++-----------
 .../cuda/experimental/__multi_gpu/concepts.h  | 12 +++++-----
 2 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h
index 926551546b3..78d49ef4259 100644
--- a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h
+++ b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h
@@ -121,14 +121,12 @@ template <class _Buffer, class _Comm, class _Env, class _InputRange, class _Tp,
 }
 
 template <class _CommRange, class _PartialType, class _RangeOfOutputIt, class _BinaryOp>
-void __direct_reduction(_CommRange&& __comms,
-                        const ::std::vector<_PartialType>& __partials,
-                        _RangeOfOutputIt&& __outputs,
-                        const _BinaryOp& __op)
+void __direct_reduction(
+  _CommRange&& __comms, _RangeOfOutputIt&& __outputs, const _BinaryOp& __op, ::std::vector<_PartialType>* __partials)
 {
   auto&& __guard = ::cuda::std::ranges::begin(__comms)->group_guard();
 
-  for (const auto [__comm, __local, __out_it] : ::cuda::std::ranges::views::zip(__comms, __partials, __outputs))
+  for (auto&& [__comm, __local, __out_it] : ::cuda::std::ranges::views::zip(__comms, *__partials, __outputs))
   {
     __comm.all_reduce(
       __guard,
@@ -142,15 +140,12 @@ void __direct_reduction(_CommRange&& __comms,
 
 template <class _CommRange, class _PartialType, class _RangeOfOutputIt, class _BinaryOp>
 void __two_stage_gather_reduction(
-  _CommRange&& __comms,
-  const ::std::vector<_PartialType>& __partials,
-  _RangeOfOutputIt&& __outputs,
-  const _BinaryOp& __op)
+  _CommRange&& __comms, _RangeOfOutputIt&& __outputs, const _BinaryOp& __op, ::std::vector<_PartialType>* __partials)
 {
   {
     auto&& __guard = ::cuda::std::ranges::begin(__comms)->group_guard();
 
-    for (const auto& [__comm, __local] : ::cuda::std::ranges::views::zip(__comms, __partials))
+    for (auto&& [__comm, __local] : ::cuda::std::ranges::views::zip(__comms, *__partials))
     {
       auto* const __ptr = __local.__buffer.data();
 
@@ -158,7 +153,7 @@ void __two_stage_gather_reduction(
     }
   }
 
-  for (auto&& [__comm, __part, __out] : ::cuda::std::ranges::views::zip(__comms, __partials, __outputs))
+  for (auto&& [__comm, __part, __out] : ::cuda::std::ranges::views::zip(__comms, *__partials, __outputs))
   {
     auto&& [__buffer, __env, _] = __part;
     const auto __num_items      = __buffer.size();
@@ -265,13 +260,14 @@ _CCCL_HOST_API void reduce(
   }
 
   if constexpr (::cuda::experimental::__has_all_reduce<::cuda::std::ranges::range_value_t<_CommRange>,
-                                                       typename __properties::__output_type*>)
+                                                       typename __properties::__output_type*,
+                                                       _BinaryOp>)
   {
-    __direct_reduction(__comms, __partials, __outputs, __op);
+    __direct_reduction(__comms, __outputs, __op, &__partials);
   }
   else
   {
-    __two_stage_gather_reduction(__comms, __partials, __outputs, __op);
+    __two_stage_gather_reduction(__comms, __outputs, __op, &__partials);
   }
 }
 
diff --git a/cudax/include/cuda/experimental/__multi_gpu/concepts.h b/cudax/include/cuda/experimental/__multi_gpu/concepts.h
index 5b3a9f20e2f..6c94acf7ec2 100644
--- a/cudax/include/cuda/experimental/__multi_gpu/concepts.h
+++ b/cudax/include/cuda/experimental/__multi_gpu/concepts.h
@@ -80,14 +80,14 @@ _CCCL_CONCEPT __communicator = _CCCL_REQUIRES_EXPR((_Comm), _Comm& __comm)(
 
 // Use a typed pointer as default here, since the op may need to instantiated with a
 // dereferenceable pointer type for reductions
-template <class _Comm, class _Ptr = int*>
+template <class _Comm, class _Ptr = int*, class _Op = cuda::std::plus<>>
 _CCCL_CONCEPT __has_reduce = _CCCL_REQUIRES_EXPR(
-  (_Comm, _Ptr),
+  (_Comm, _Ptr, _Op),
   _Comm& __comm,
   _Ptr __sendbuff,
   _Ptr __recvbuff,
   ::cuda::std::size_t __count,
-  ::cuda::std::plus<> __op,
+  _Op __op,
   ::cuda::std::int32_t __root,
   ::cuda::stream_ref __stream)(
   requires(__communicator<_Comm>),
@@ -98,14 +98,14 @@ _CCCL_CONCEPT __has_reduce = _CCCL_REQUIRES_EXPR(
 
 // Use a typed pointer as default here, since the op may need to instantiated with a
 // dereferenceable pointer type for reductions
-template <class _Comm, class _Ptr = int*>
+template <class _Comm, class _Ptr = int*, class _Op = ::cuda::std::plus<>>
 _CCCL_CONCEPT __has_all_reduce = _CCCL_REQUIRES_EXPR(
-  (_Comm, _Ptr),
+  (_Comm, _Ptr, _Op),
   _Comm& __comm,
   _Ptr __sendbuff,
   _Ptr __recvbuff,
   ::cuda::std::size_t __count,
-  ::cuda::std::plus<> __op,
+  _Op __op,
   ::cuda::stream_ref __stream)(
   requires(__communicator<_Comm>),
   _Same_as(void) __comm.all_reduce(

From b9b17664a74f6737ac9a11dbed6fbce3d1f43662 Mon Sep 17 00:00:00 2001
From: Jacob Faibussowitsch <jacob.fai@gmail.com>
Date: Thu, 2 Jul 2026 14:43:02 -0400
Subject: [PATCH 7/7] fixup! Add multi-GPU reduce

---
 c2h/include/c2h/catch2_test_helper.h          | 29 +++++-
 c2h/include/c2h/vector.h                      |  2 +
 .../__multi_gpu/algorithm/reduce/reduce.h     | 54 +++++++---
 .../test/multi_gpu/algorithms/reduce/basic.cu | 98 +++++++++++++------
 .../algorithms/reduce/range_defaults.cu       | 16 ++-
 5 files changed, 137 insertions(+), 62 deletions(-)

diff --git a/c2h/include/c2h/catch2_test_helper.h b/c2h/include/c2h/catch2_test_helper.h
index 923fe077319..9c60d84ff6a 100644
--- a/c2h/include/c2h/catch2_test_helper.h
+++ b/c2h/include/c2h/catch2_test_helper.h
@@ -5,6 +5,7 @@
 
 #include <cuda/std/detail/__config>
 
+#include <cuda/__memory_resource/legacy_pinned_memory_resource.h>
 #include <cuda/__nvtx/nvtx.h>
 #include <cuda/buffer>
 #include <cuda/std/bit>
@@ -360,8 +361,8 @@ struct vector_compare_result_t
   std::optional<std::vector<element_compare_result_t<T>>> last_mismatches;
 };
 
-template <typename T>
-auto compare_vectors(const host_vector<T>& actual, const host_vector<T>& expected) -> vector_compare_result_t<T>
+template <typename LhsRange, typename RhsRange, typename T = typename LhsRange::value_type>
+auto compare_host_ranges(const LhsRange& actual, const RhsRange& expected) -> vector_compare_result_t<T>
 {
   constexpr size_t good_values_before_mismatch = 3;
   constexpr size_t first_mismatches_count      = 5;
@@ -411,6 +412,12 @@ auto compare_vectors(const host_vector<T>& actual, const host_vector<T>& expecte
   return result;
 }
 
+template <typename T>
+auto compare_vectors(const host_vector<T>& actual, const host_vector<T>& expected) -> vector_compare_result_t<T>
+{
+  return compare_host_ranges(actual, expected);
+}
+
 template <typename T>
 auto compare_vectors(const device_vector<T>& actual, const device_vector<T>& expected) -> vector_compare_result_t<T>
 {
@@ -421,10 +428,12 @@ template <typename T, typename... LhsProps, typename... RhsProps>
 auto compare_vectors(const cuda::buffer<T, LhsProps...>& actual, const cuda::buffer<T, RhsProps...>& expected)
   -> vector_compare_result_t<T>
 {
+  const auto actual_host   = cuda::make_buffer(actual.stream(), cuda::mr::legacy_pinned_memory_resource{}, actual);
+  const auto expected_host = cuda::make_buffer(expected.stream(), cuda::mr::legacy_pinned_memory_resource{}, expected);
+
   actual.stream().sync();
   expected.stream().sync();
-  return compare_vectors<T>(host_vector<T>(actual.begin(), actual.end()),
-                            host_vector<T>(expected.begin(), expected.end()));
+  return compare_host_ranges(actual_host, expected_host);
 }
 
 template <typename LhsVec, typename RhsVec, typename T = typename LhsVec::value_type>
@@ -559,6 +568,18 @@ ::std::ostream& operator<<(::std::ostream& os, const tuple<T...>& tup)
 }
 _CCCL_END_NAMESPACE_CUDA_STD
 
+_CCCL_BEGIN_NAMESPACE_CUDA
+template <typename T, typename... Props>
+::std::ostream& operator<<(::std::ostream& os, const cuda::buffer<T, Props...>& buffer)
+{
+  const auto host_buf = cuda::make_buffer(buffer.stream(), cuda::mr::legacy_pinned_memory_resource{}, buffer);
+
+  buffer.stream().sync();
+  os << ::Catch::Detail::stringify(::std::vector<T>{host_buf.begin(), host_buf.end()});
+  return os;
+}
+_CCCL_END_NAMESPACE_CUDA
+
 template <>
 struct Catch::StringMaker<cudaError>
 {
diff --git a/c2h/include/c2h/vector.h b/c2h/include/c2h/vector.h
index d26cfa6a206..0329a2ddaca 100644
--- a/c2h/include/c2h/vector.h
+++ b/c2h/include/c2h/vector.h
@@ -5,6 +5,8 @@
 
 #include <thrust/detail/vector_base.h>
 
+#include <cuda/__memory_resource/legacy_pinned_memory_resource.h>
+#include <cuda/buffer>
 #include <cuda/std/type_traits>
 
 #include <memory>
diff --git a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h
index 78d49ef4259..6650508a1bc 100644
--- a/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h
+++ b/cudax/include/cuda/experimental/__multi_gpu/algorithm/reduce/reduce.h
@@ -25,6 +25,7 @@
 #include <cub/device/device_reduce.cuh>
 #include <cub/device/dispatch/kernels/kernel_reduce.cuh>
 
+#include <cuda/__functional/operator_properties.h>
 #include <cuda/__memory_resource/resource.h>
 #include <cuda/__runtime/ensure_current_context.h>
 #include <cuda/__stream/get_stream.h>
@@ -69,16 +70,13 @@ template <class _Buffer, class _Comm, class _Env, class _InputRange, class _Tp,
   _Env __env,
   _InputRange&& __inputs,
   const _Tp& __init,
+  const _Tp& __ident,
   _BinaryOp __op)
 {
   const auto& __logical_device = __comm.logical_device();
   auto __stream                = __stream_from_env(__env);
   auto __resource              = __resource_from_env(__env, __logical_device.underlying_device());
-
-  static_assert(::cuda::mr::resource_with<decltype(__resource), ::cuda::mr::device_accessible>,
-                "Provided memory resource must be device accessible");
-
-  const auto __num_items = ::cuda::std::ranges::size(__inputs);
+  const auto __num_items       = ::cuda::std::ranges::size(__inputs);
   // Allocate enough storage so that we can use the buffer directly in an in-place comm all
   // gather/all reduce call. Those calls require that the receive buffer is of size nranks *
   // sendcount.
@@ -114,9 +112,10 @@ template <class _Buffer, class _Comm, class _Env, class _InputRange, class _Tp,
        __buff.begin() + __rank,
        __num_items_fixed,
        ::cuda::std::move(__op),
-       CUB_NS_QUALIFIER::detail::reduce::no_init,
+       __ident,
        __env));
   }
+
   return {::cuda::std::move(__buff), ::cuda::std::move(__env), __stream};
 }
 
@@ -199,6 +198,10 @@ void __two_stage_gather_reduction(
 //! After this call returns, all local output iterators will hold the same value. In that sense
 //! this routine is similar to an "all reduce".
 //!
+//! The identity element should survive reduction with any other value, returning the original
+//! value unchanged. For example, for integers/foats and plus, the identity element is 0. For
+//! maximum and minimum, the identity values are INT_MIN, and INT_MAX respectively.
+//!
 //! @tparam _CommRange The range of communicators. Each element must model the communicator
 //!         concept.
 //! @tparam _EnvRange The range of execution environments. Each environment supplies the
@@ -216,6 +219,7 @@ void __two_stage_gather_reduction(
 //! @param[out] __outputs The range of output iterators receiving the per-communicator results.
 //! @param[in] __init The initial value seeding each reduction.
 //! @param[in] __op The binary reduction operator.
+//! @param[in] __ident The identity element to be used in case of empty ranges.
 _CCCL_TEMPLATE(class _CommRange,
                class _EnvRange,
                class _InputRangeOfRanges,
@@ -231,7 +235,8 @@ _CCCL_HOST_API void reduce(
   _InputRangeOfRanges&& __range_of_inputs,
   _RangeOfOutputIt&& __outputs,
   _Tp __init     = {},
-  _BinaryOp __op = {})
+  _BinaryOp __op = {},
+  _Tp __ident    = ::cuda::identity_element<_BinaryOp, _Tp>())
 {
   static_assert(::cuda::std::ranges::sized_range<_CommRange>);
 
@@ -256,18 +261,18 @@ _CCCL_HOST_API void reduce(
   {
     __partials.emplace_back(
       ::cuda::experimental::__detail::__reduce::__local_reduction<typename __partial_type::__buffer_type>(
-        /*__ROOT_RANK=*/0, __comm, __env, __inputs, __init, __op));
+        /*__ROOT_RANK=*/0, __comm, __env, __inputs, __init, __ident, __op));
   }
 
   if constexpr (::cuda::experimental::__has_all_reduce<::cuda::std::ranges::range_value_t<_CommRange>,
                                                        typename __properties::__output_type*,
                                                        _BinaryOp>)
   {
-    __direct_reduction(__comms, __outputs, __op, &__partials);
+    ::cuda::experimental::__detail::__reduce::__direct_reduction(__comms, __outputs, __op, &__partials);
   }
   else
   {
-    __two_stage_gather_reduction(__comms, __outputs, __op, &__partials);
+    ::cuda::experimental::__detail::__reduce::__two_stage_gather_reduction(__comms, __outputs, __op, &__partials);
   }
 }
 
@@ -293,6 +298,7 @@ _CCCL_HOST_API void reduce(
 //! @param[out] __output The output iterator receiving the result.
 //! @param[in] __init The initial value seeding the reduction.
 //! @param[in] __op The binary reduction operator.
+//! @param[in] __ident The identity element to be used in case of empty ranges.
 _CCCL_TEMPLATE(class _Comm,
                class _Env,
                class _InputRange,
@@ -301,15 +307,22 @@ _CCCL_TEMPLATE(class _Comm,
                class _BinaryOp = ::cuda::std::plus<>)
 _CCCL_REQUIRES(__communicator<_Comm> _CCCL_AND ::cuda::std::ranges::random_access_range<_InputRange>
                  _CCCL_AND ::cuda::std::output_iterator<_OutputIt, _Tp>)
-_CCCL_HOST_API void
-reduce(_Comm&& __comm, _Env&& __env, _InputRange&& __inputs, _OutputIt __output, _Tp __init = {}, _BinaryOp __op = {})
+_CCCL_HOST_API void reduce(
+  _Comm&& __comm,
+  _Env&& __env,
+  _InputRange&& __inputs,
+  _OutputIt __output,
+  _Tp __init     = {},
+  _BinaryOp __op = {},
+  _Tp __ident    = ::cuda::identity_element<_BinaryOp, _Tp>())
 {
   reduce(::cuda::std::span<::cuda::std::remove_reference_t<_Comm>, 1>{::cuda::std::addressof(__comm), 1},
          ::cuda::std::span<::cuda::std::remove_reference_t<_Env>, 1>{::cuda::std::addressof(__env), 1},
          ::cuda::std::span<::cuda::std::remove_reference_t<_InputRange>, 1>{::cuda::std::addressof(__inputs), 1},
          ::cuda::std::span<::cuda::std::remove_reference_t<_OutputIt>, 1>{::cuda::std::addressof(__output), 1},
          ::cuda::std::move(__init),
-         ::cuda::std::move(__op));
+         ::cuda::std::move(__op),
+         ::cuda::std::move(__ident));
 }
 
 //! @brief Reduce a single input range over a single communicator using
@@ -326,6 +339,9 @@ reduce(_Comm&& __comm, _Env&& __env, _InputRange&& __inputs, _OutputIt __output,
 //! @param[in] __comm The communicator.
 //! @param[in] __inputs The input range to reduce.
 //! @param[out] __output The output iterator receiving the result.
+//! @param[in] __init The initial value seeding the reduction.
+//! @param[in] __op The binary reduction operator.
+//! @param[in] __ident The identity element to be used in case of empty ranges.
 _CCCL_TEMPLATE(class _Comm,
                class _InputRange,
                class _OutputIt,
@@ -333,15 +349,21 @@ _CCCL_TEMPLATE(class _Comm,
                class _BinaryOp = ::cuda::std::plus<>)
 _CCCL_REQUIRES(__communicator<_Comm> _CCCL_AND ::cuda::std::ranges::random_access_range<_InputRange>
                  _CCCL_AND ::cuda::std::output_iterator<_OutputIt, _Tp>)
-_CCCL_HOST_API void
-reduce(_Comm&& __comm, _InputRange&& __inputs, _OutputIt __output, _Tp __init = {}, _BinaryOp __op = {})
+_CCCL_HOST_API void reduce(
+  _Comm&& __comm,
+  _InputRange&& __inputs,
+  _OutputIt __output,
+  _Tp __init     = {},
+  _BinaryOp __op = {},
+  _Tp __ident    = ::cuda::identity_element<_BinaryOp, _Tp>())
 {
   reduce(::cuda::std::forward<_Comm>(__comm),
          ::cuda::std::execution::env<>{},
          ::cuda::std::forward<_InputRange>(__inputs),
          ::cuda::std::forward<_OutputIt>(__output),
          ::cuda::std::move(__init),
-         ::cuda::std::move(__op));
+         ::cuda::std::move(__op),
+         ::cuda::std::move(__ident));
 }
 } // namespace cuda::experimental
 
diff --git a/cudax/test/multi_gpu/algorithms/reduce/basic.cu b/cudax/test/multi_gpu/algorithms/reduce/basic.cu
index babffa8ff50..0c5f998bc14 100644
--- a/cudax/test/multi_gpu/algorithms/reduce/basic.cu
+++ b/cudax/test/multi_gpu/algorithms/reduce/basic.cu
@@ -24,6 +24,7 @@
 #include <testing.cuh>
 
 #include "../../communicators/nccl/nccl_test_helpers.cuh"
+#include <c2h/vector.h>
 
 namespace
 {
@@ -39,7 +40,7 @@ T make_value(int i)
 template <>
 custom_value make_value<>(int i)
 {
-  return {static_cast<cuda::std::size_t>(i), static_cast<cuda::std::size_t>(i)};
+  return {{static_cast<cuda::std::size_t>(i), static_cast<cuda::std::size_t>(i)}};
 };
 
 // A custom reduction operator equivalent to `cuda::std::plus<>`.
@@ -52,6 +53,42 @@ struct custom_plus
   }
 };
 
+template <class T, class Op>
+struct identity_value; // Unsupported combinations fail to compile.
+
+template <class T, class... U>
+struct identity_value<T, cuda::std::plus<U...>>
+{
+  [[nodiscard]] static T get()
+  {
+    return make_value<T>(0);
+  }
+};
+
+template <class T>
+struct identity_value<T, custom_plus>
+{
+  [[nodiscard]] static T get()
+  {
+    return make_value<T>(0);
+  }
+};
+
+template <class T, class... U>
+struct identity_value<T, cuda::maximum<U...>>
+{
+  [[nodiscard]] static T get()
+  {
+    return cuda::std::numeric_limits<T>::lowest();
+  }
+};
+
+template <class T, class Op>
+[[nodiscard]] T get_identity()
+{
+  return identity_value<T, Op>::get();
+}
+
 using value_types = c2h::type_list<cuda::std::int32_t, float, custom_value>;
 using operators   = c2h::type_list<::cuda::std::plus<>, ::cuda::maximum<>, custom_plus>;
 
@@ -73,7 +110,8 @@ template <class OutBuffers>
 // Run the full reduction, wait for it to finish, and check that `reduce` left its argument ranges
 // untouched. This boilerplate is identical for every test regardless of how the inputs are shaped.
 template <class Comms, class Envs, class In, class Outputs, class T, class Op, class Streams>
-void do_reduce(Comms& comms, Envs& envs, In& in, Outputs& outputs, const T& init, Op op, Streams& streams)
+void do_reduce(
+  Comms& comms, Envs& envs, In& in, Outputs& outputs, const T& init, const T& ident, Op op, Streams& streams)
 {
   using value_type = typename cuda::std::remove_cvref_t<decltype(in.front())>::value_type;
 
@@ -90,7 +128,7 @@ void do_reduce(Comms& comms, Envs& envs, In& in, Outputs& outputs, const T& init
 
   const auto outputs_copy = outputs;
 
-  cudax::reduce(comms, envs, in, outputs, init, op);
+  cudax::reduce(comms, envs, in, outputs, init, op, ident);
 
   for (auto& stream : streams)
   {
@@ -103,7 +141,7 @@ void do_reduce(Comms& comms, Envs& envs, In& in, Outputs& outputs, const T& init
     const auto actual = cuda::make_buffer(in[i].stream(), pool, in[i]);
     REQUIRE_THAT(actual, Equals(in_copy[i]));
   }
-  REQUIRE(outputs == outputs_copy);
+  REQUIRE_THAT(outputs, Catch::Matchers::Equals(outputs_copy));
 }
 } // namespace
 
@@ -114,7 +152,8 @@ MGMN_TEST("reduce, one element per rank", value_types, operators)
 
   // Seed each reduction with a few hardcoded initializers. The init participates in the fold the
   // same way on host and device, so any value works for every operator under test.
-  const T init = make_value<T>(GENERATE(0, 1, -1, 5));
+  const T init     = make_value<T>(GENERATE(0, 1, -1, 5));
+  const auto ident = get_identity<T, Op>();
 
   auto comms   = this->communicators();
   auto streams = nccl_test_util::make_streams();
@@ -131,7 +170,7 @@ MGMN_TEST("reduce, one element per rank", value_types, operators)
   in.reserve(comms.size());
   out.reserve(comms.size());
   envs.reserve(comms.size());
-  for (int i = 0; i < comms.size(); ++i)
+  for (cuda::std::size_t i = 0; i < comms.size(); ++i)
   {
     const auto values = {make_value<T>(comms[i].rank())};
     in.emplace_back(cuda::make_device_buffer<T>(streams[i], comms[i].logical_device().underlying_device(), values));
@@ -146,17 +185,15 @@ MGMN_TEST("reduce, one element per rank", value_types, operators)
 
   auto outputs = make_output_iterators(out);
 
-  do_reduce(comms, envs, in, outputs, init, Op{}, streams);
+  do_reduce(comms, envs, in, outputs, init, ident, Op{}, streams);
 
   const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{});
 
   for (const auto& buf : out)
   {
-    auto pool         = cuda::mr::legacy_pinned_memory_resource{};
-    const auto actual = cuda::make_buffer(buf.stream(), pool, buf);
+    const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected);
 
-    REQUIRE(actual.size() == 1);
-    REQUIRE(actual.front() == expected);
+    REQUIRE_THAT(buf, Equals(exp));
   }
 }
 
@@ -167,7 +204,8 @@ MGMN_TEST("reduce, multiple elements per rank", value_types, operators)
 
   // Seed each reduction with a few hardcoded initializers. The init participates in the fold the
   // same way on host and device, so any value works for every operator under test.
-  const T init = make_value<T>(GENERATE(0, 1, -1, 5));
+  const T init     = make_value<T>(GENERATE(0, 1, -1, 5));
+  const auto ident = get_identity<T, Op>();
 
   auto comms   = this->communicators();
   auto streams = nccl_test_util::make_streams();
@@ -184,7 +222,7 @@ MGMN_TEST("reduce, multiple elements per rank", value_types, operators)
   in.reserve(comms.size());
   out.reserve(comms.size());
   envs.reserve(comms.size());
-  for (int i = 0; i < comms.size(); ++i)
+  for (cuda::std::size_t i = 0; i < comms.size(); ++i)
   {
     const auto v      = make_value<T>(comms[i].rank());
     const auto values = {v, v, v};
@@ -201,17 +239,15 @@ MGMN_TEST("reduce, multiple elements per rank", value_types, operators)
 
   auto outputs = make_output_iterators(out);
 
-  do_reduce(comms, envs, in, outputs, init, Op{}, streams);
+  do_reduce(comms, envs, in, outputs, init, ident, Op{}, streams);
 
   const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{});
 
   for (const auto& buf : out)
   {
-    auto pool         = cuda::mr::legacy_pinned_memory_resource{};
-    const auto actual = cuda::make_buffer(buf.stream(), pool, buf);
+    const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected);
 
-    REQUIRE(actual.size() == 1);
-    REQUIRE(actual.front() == expected);
+    REQUIRE_THAT(buf, Equals(exp));
   }
 }
 
@@ -220,7 +256,8 @@ MGMN_TEST("reduce, some ranks empty", value_types, operators)
   using T  = c2h::get<0, TestType>;
   using Op = c2h::get<1, TestType>;
 
-  const T init = make_value<T>(GENERATE(0, 1, -1, 5));
+  const T init     = make_value<T>(GENERATE(0, 1, -1, 5));
+  const auto ident = get_identity<T, Op>();
 
   auto comms   = this->communicators();
   auto streams = nccl_test_util::make_streams();
@@ -237,7 +274,7 @@ MGMN_TEST("reduce, some ranks empty", value_types, operators)
   in.reserve(comms.size());
   out.reserve(comms.size());
   envs.reserve(comms.size());
-  for (int i = 0; i < comms.size(); ++i)
+  for (cuda::std::size_t i = 0; i < comms.size(); ++i)
   {
     const auto rank = comms[i].rank();
     if (rank % 2 == 0)
@@ -262,17 +299,15 @@ MGMN_TEST("reduce, some ranks empty", value_types, operators)
 
   auto outputs = make_output_iterators(out);
 
-  do_reduce(comms, envs, in, outputs, init, Op{}, streams);
+  do_reduce(comms, envs, in, outputs, init, ident, Op{}, streams);
 
   const T expected = std::accumulate(reference.begin(), reference.end(), init, Op{});
 
   for (const auto& buf : out)
   {
-    auto pool         = cuda::mr::legacy_pinned_memory_resource{};
-    const auto actual = cuda::make_buffer(buf.stream(), pool, buf);
+    const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected);
 
-    REQUIRE(actual.size() == 1);
-    REQUIRE(actual.front() == expected);
+    REQUIRE_THAT(buf, Equals(exp));
   }
 }
 
@@ -281,7 +316,8 @@ MGMN_TEST("reduce, all ranks empty", value_types, operators)
   using T  = c2h::get<0, TestType>;
   using Op = c2h::get<1, TestType>;
 
-  const T init = make_value<T>(GENERATE(0, 1, -1, 5));
+  const T init     = make_value<T>(GENERATE(0, 1, -1, 5));
+  const auto ident = get_identity<T, Op>();
 
   auto comms   = this->communicators();
   auto streams = nccl_test_util::make_streams();
@@ -295,7 +331,7 @@ MGMN_TEST("reduce, all ranks empty", value_types, operators)
   in.reserve(comms.size());
   out.reserve(comms.size());
   envs.reserve(comms.size());
-  for (int i = 0; i < comms.size(); ++i)
+  for (cuda::std::size_t i = 0; i < comms.size(); ++i)
   {
     in.emplace_back(cuda::make_device_buffer<T>(streams[i], comms[i].logical_device().underlying_device()));
     out.emplace_back(
@@ -305,7 +341,7 @@ MGMN_TEST("reduce, all ranks empty", value_types, operators)
 
   auto outputs = make_output_iterators(out);
 
-  do_reduce(comms, envs, in, outputs, init, Op{}, streams);
+  do_reduce(comms, envs, in, outputs, init, ident, Op{}, streams);
 
   // Reducing nothing seeded by `init` yields `init`, exactly like `std::accumulate` over an empty
   // range.
@@ -313,10 +349,8 @@ MGMN_TEST("reduce, all ranks empty", value_types, operators)
 
   for (const auto& buf : out)
   {
-    auto pool         = cuda::mr::legacy_pinned_memory_resource{};
-    const auto actual = cuda::make_buffer(buf.stream(), pool, buf);
+    const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected);
 
-    REQUIRE(actual.size() == 1);
-    REQUIRE(actual.front() == expected);
+    REQUIRE_THAT(buf, Equals(exp));
   }
 }
diff --git a/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu b/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu
index e37ffffbaf2..1c8dbca81dd 100644
--- a/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu
+++ b/cudax/test/multi_gpu/algorithms/reduce/range_defaults.cu
@@ -54,7 +54,7 @@ MGMN_TEST("reduce, range overload with default initializer and default plus", )
   in.reserve(comms.size());
   out.reserve(comms.size());
   envs.reserve(comms.size());
-  for (int i = 0; i < comms.size(); ++i)
+  for (cuda::std::size_t i = 0; i < comms.size(); ++i)
   {
     const auto values = {static_cast<value_type>(comms[i].rank())};
     in.emplace_back(
@@ -81,11 +81,9 @@ MGMN_TEST("reduce, range overload with default initializer and default plus", )
 
   for (const auto& buf : out)
   {
-    auto pool         = cuda::mr::legacy_pinned_memory_resource{};
-    const auto actual = cuda::make_buffer(buf.stream(), pool, buf);
+    const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected);
 
-    REQUIRE(actual.size() == 1);
-    REQUIRE(actual.front() == expected);
+    REQUIRE_THAT(buf, Equals(exp));
   }
 }
 
@@ -104,7 +102,7 @@ MGMN_TEST("reduce, range overload with explicit initializer and default plus", )
   in.reserve(comms.size());
   out.reserve(comms.size());
   envs.reserve(comms.size());
-  for (int i = 0; i < comms.size(); ++i)
+  for (cuda::std::size_t i = 0; i < comms.size(); ++i)
   {
     const auto values = {static_cast<value_type>(comms[i].rank())};
     in.emplace_back(
@@ -131,10 +129,8 @@ MGMN_TEST("reduce, range overload with explicit initializer and default plus", )
 
   for (const auto& buf : out)
   {
-    auto pool         = cuda::mr::legacy_pinned_memory_resource{};
-    const auto actual = cuda::make_buffer(buf.stream(), pool, buf);
+    const auto exp = cuda::make_buffer(buf.stream(), cuda::mr::legacy_pinned_memory_resource{}, 1, expected);
 
-    REQUIRE(actual.size() == 1);
-    REQUIRE(actual.front() == expected);
+    REQUIRE_THAT(buf, Equals(exp));
   }
 }