taskflow/taskflow/cuda/algorithm/find.hpp at task_isolation · ModuleWorks/taskflow

294 lines (236 loc) · 7.36 KB
#pragma once
#include "for_each.hpp"
#include "reduce.hpp"
/**
@file taskflow/cuda/algorithm/find.hpp
@brief cuda find algorithms include file
*/
namespace tf::detail {
/**
@private

@brief value/index pair reduced by the min/max-element algorithms

@tparam T type of the element value stored in @c key
*/
template <typename T>
struct cudaFindPair {

  // value of the element read from the input range
  T key;

  // position of that element within the input range
  unsigned index;

  // yields only the index; presumably this is what lets the reduction store
  // its winning pair through an `unsigned*` result - confirm in reduce.hpp
  __device__ operator unsigned () const { return index; }
};
/** @private */
template <typename P, typename I, typename U>
void cuda_find_if_loop(P&& p, I input, unsigned count, unsigned* idx, U pred) {
  if(count == 0) {
    cuda_single_task(p, [=] __device__ () { *idx = 0; });
    return;
  using E = std::decay_t<P>;
  auto B = (count + E::nv - 1) / E::nv;
  // set the index to the maximum
  cuda_single_task(p, [=] __device__ () { *idx = count; });
  // launch the kernel to atomic-find the minimum
  cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {
    __shared__ unsigned shm_id;
    if(!tid) {
      shm_id = count;
    __syncthreads();
    auto tile = cuda_get_tile(bid, E::nv, count);
    auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
      input + tile.begin, tid, tile.count()
    auto id = count;
    for(unsigned i=0; i<E::vt; i++) {
      auto j = E::nt*i + tid;
      if(j < tile.count() && pred(x[i])) {
        id = j + tile.begin;
        break;
    // Note: the reduce version is not faster though
    // reduce to a scalar per block.
    //__shared__ typename cudaBlockReduce<E::nt, unsigned>::Storage shm;
    //id = cudaBlockReduce<E::nt, unsigned>()(
    //  tid,
    //  id,
    //  shm,
    //  (tile.count() < E::nt ? tile.count() : E::nt),
    //  cuda_minimum<unsigned>{},
    //  false
    // only need the minimum id
    atomicMin(&shm_id, id);
    __syncthreads();
    // reduce all to the global memory
    if(!tid) {
      atomicMin(idx, shm_id);
      //atomicMin(idx, id);
/** @private */
template <typename P, typename I, typename O>
void cuda_min_element_loop(
  P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
  if(count == 0) {
    cuda_single_task(p, [=] __device__ () { *idx = 0; });
    return;
  using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;
  cuda_uninitialized_reduce_loop(p,
    cuda_make_load_iterator<T>([=]__device__(auto i){
      return T{*(input+i), i};
    [=] __device__ (const auto& a, const auto& b) {
      return op(a.key, b.key) ? a : b;
/** @private */
template <typename P, typename I, typename O>
void cuda_max_element_loop(
  P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
  if(count == 0) {
    cuda_single_task(p, [=] __device__ () { *idx = 0; });
    return;
  using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;
  cuda_uninitialized_reduce_loop(p,
    cuda_make_load_iterator<T>([=]__device__(auto i){
      return T{*(input+i), i};
    [=] __device__ (const auto& a, const auto& b) {
      return op(a.key, b.key) ? b : a;
}  // end of namespace tf::detail ---------------------------------------------
namespace tf {
// ----------------------------------------------------------------------------
// cuda_find_if
// ----------------------------------------------------------------------------
@brief finds the index of the first element that satisfies the given criteria
@tparam P execution policy type
@tparam I input iterator type
@tparam U unary operator type
@param p execution policy
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param idx pointer to the index of the found element
@param op unary operator which returns @c true for the required element
The function launches kernels asynchronously to find the index @c idx of the
first element in the range <tt>[first, last)</tt>
such that <tt>op(*(first+idx))</tt> is true.
This is equivalent to the parallel execution of the following loop:
@code{.cpp}
unsigned idx = 0;
for(; first != last; ++first, ++idx) {
  if (p(*first)) {
    return idx;
return idx;
template <typename P, typename I, typename U>
void cuda_find_if(
  P&& p, I first, I last, unsigned* idx, U op
  detail::cuda_find_if_loop(p, first, std::distance(first, last), idx, op);
// ----------------------------------------------------------------------------
// cuda_min_element
// ----------------------------------------------------------------------------
// Function: min_element_bufsz
// Returns the buffer size needed by cuda_min_element for `count` elements of
// type T: the reduce buffer size for the value/index pair type
// detail::cudaFindPair<T>.
template <unsigned NT, unsigned VT>
template <typename T>
unsigned cudaExecutionPolicy<NT, VT>::min_element_bufsz(unsigned count) {
  return reduce_bufsz<detail::cudaFindPair<T>>(count);
}
@brief finds the index of the minimum element in a range
@tparam P execution policy type
@tparam I input iterator type
@tparam O comparator type
@param p execution policy object
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param idx solution index of the minimum element
@param op comparison function object
@param buf pointer to the buffer
The function launches kernels asynchronously to find
the smallest element in the range <tt>[first, last)</tt>
using the given comparator @c op.
You need to provide a buffer that holds at least
tf::cuda_min_element_bufsz bytes for internal use.
The function is equivalent to a parallel execution of the following loop:
@code{.cpp}
if(first == last) {
  return 0;
auto smallest = first;
for (++first; first != last; ++first) {
  if (op(*first, *smallest)) {
    smallest = first;
return std::distance(first, smallest);
template <typename P, typename I, typename O>
void cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
  detail::cuda_min_element_loop(
    p, first, std::distance(first, last), idx, op, buf
// ----------------------------------------------------------------------------
// cuda_max_element
// ----------------------------------------------------------------------------
// Function: max_element_bufsz
// Returns the buffer size needed by cuda_max_element for `count` elements of
// type T: the reduce buffer size for the value/index pair type
// detail::cudaFindPair<T>.
template <unsigned NT, unsigned VT>
template <typename T>
unsigned cudaExecutionPolicy<NT, VT>::max_element_bufsz(unsigned count) {
  return reduce_bufsz<detail::cudaFindPair<T>>(count);
}
@brief finds the index of the maximum element in a range
@tparam P execution policy type
@tparam I input iterator type
@tparam O comparator type
@param p execution policy object
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param idx solution index of the maximum element
@param op comparison function object
@param buf pointer to the buffer
The function launches kernels asynchronously to find
the largest element in the range <tt>[first, last)</tt>
using the given comparator @c op.
You need to provide a buffer that holds at least
tf::cuda_max_element_bufsz bytes for internal use.
The function is equivalent to a parallel execution of the following loop:
@code{.cpp}
if(first == last) {
  return 0;
auto largest = first;
for (++first; first != last; ++first) {
  if (op(*largest, *first)) {
    largest = first;
return std::distance(first, largest);
template <typename P, typename I, typename O>
void cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
  detail::cuda_max_element_loop(
    p, first, std::distance(first, last), idx, op, buf
}  // end of namespace tf -----------------------------------------------------
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

find.hpp

Latest commit

History

find.hpp

File metadata and controls