-
-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Expand file tree
/
Copy pathtransform.hpp
More file actions
126 lines (98 loc) · 3.31 KB
/
Copy pathtransform.hpp
File metadata and controls
126 lines (98 loc) · 3.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#pragma once
#include "../cudaflow.hpp"
/**
@file taskflow/cuda/algorithm/transform.hpp
@brief include file for the CUDA parallel-transform algorithms
*/
namespace tf {
// ----------------------------------------------------------------------------
// transform
// ----------------------------------------------------------------------------
namespace detail {
/**
@private

Unary transform kernel: for every index i it handles, stores
op(*(first + i)) into *(output + i).

The block obtains its tile from cuda_get_tile(blockIdx.x, E::nv, count) and
walks it through cuda_strided_iterate<E::nt, E::vt>, passing threadIdx.x and
the tile's element count. The first lambda argument is ignored; the second is
used as an offset relative to the start of the tile.

NOTE(review): presumably the tile covers at most E::nv consecutive elements
and is clipped to count - confirm against cuda_get_tile.
*/
template <typename I, typename O, typename C, typename E>
__global__ void cuda_transform_kernel(I first, unsigned count, O output, C op) {
  auto thread_in_block = threadIdx.x;
  auto block_in_grid = blockIdx.x;
  auto block_tile = cuda_get_tile(block_in_grid, E::nv, count);
  // Hoisted so the lambda captures only the tile origin, not the whole tile.
  auto tile_origin = block_tile.begin;
  cuda_strided_iterate<E::nt, E::vt>(
    [=]__device__(auto, auto local_idx) {
      // Turn the tile-relative offset into an index into the full range.
      auto global_idx = local_idx + tile_origin;
      *(output + global_idx) = op(*(first + global_idx));
    },
    thread_in_block,
    block_tile.count()
  );
}
/**
@private

Binary transform kernel: for every index i it handles, stores
op(*(first1 + i), *(first2 + i)) into *(output + i).

The block obtains its tile from cuda_get_tile(blockIdx.x, E::nv, count) and
walks it through cuda_strided_iterate<E::nt, E::vt>, passing threadIdx.x and
the tile's element count. The first lambda argument is ignored; the second is
used as an offset relative to the start of the tile.

NOTE(review): presumably the tile covers at most E::nv consecutive elements
and is clipped to count - confirm against cuda_get_tile.
*/
template <typename I1, typename I2, typename O, typename C, typename E>
__global__ void cuda_transform_kernel(
  I1 first1, I2 first2, unsigned count, O output, C op
) {
  auto thread_in_block = threadIdx.x;
  auto block_in_grid = blockIdx.x;
  auto block_tile = cuda_get_tile(block_in_grid, E::nv, count);
  // Hoisted so the lambda captures only the tile origin, not the whole tile.
  auto tile_origin = block_tile.begin;
  cuda_strided_iterate<E::nt, E::vt>(
    [=]__device__(auto, auto local_idx) {
      // Both inputs and the output are addressed with the same index.
      auto global_idx = local_idx + tile_origin;
      *(output + global_idx) = op(*(first1 + global_idx), *(first2 + global_idx));
    },
    thread_in_block,
    block_tile.count()
  );
}
} // end of namespace detail -------------------------------------------------
// ----------------------------------------------------------------------------
// cudaFlow
// ----------------------------------------------------------------------------
// Function: transform
// Creates a task through kernel() that runs the unary transform kernel over
// [first, last): each element is passed to c and the result is stored at the
// matching position starting from output. Returns the task kernel() returns.
// NOTE(review): the std::distance result is narrowed to unsigned, so a range
// whose length does not fit in unsigned would wrap - confirm callers stay
// within that limit.
template <typename Creator, typename Deleter>
template <typename I, typename O, typename C, typename E>
cudaTask cudaGraphBase<Creator, Deleter>::transform(I first, I last, O output, C c) {
  unsigned num_items = static_cast<unsigned>(std::distance(first, last));
  // Leading arguments are presumably grid size, block size, and dynamic
  // shared-memory bytes - confirm against kernel().
  return kernel(
    E::num_blocks(num_items),
    E::nt,
    0,
    detail::cuda_transform_kernel<I, O, C, E>,
    first, num_items, output, c
  );
}
// Function: transform
// Creates a task through kernel() that runs the binary transform kernel:
// elements of [first1, last1) are paired with the elements starting at first2,
// each pair is passed to c, and the result is stored at the matching position
// starting from output. Returns the task kernel() returns.
// NOTE(review): the std::distance result is narrowed to unsigned, so a range
// whose length does not fit in unsigned would wrap - confirm callers stay
// within that limit.
template <typename Creator, typename Deleter>
template <typename I1, typename I2, typename O, typename C, typename E>
cudaTask cudaGraphBase<Creator, Deleter>::transform(I1 first1, I1 last1, I2 first2, O output, C c) {
  unsigned num_items = static_cast<unsigned>(std::distance(first1, last1));
  // Leading arguments are presumably grid size, block size, and dynamic
  // shared-memory bytes - confirm against kernel().
  return kernel(
    E::num_blocks(num_items),
    E::nt,
    0,
    detail::cuda_transform_kernel<I1, I2, O, C, E>,
    first1, first2, num_items, output, c
  );
}
// Function: update transform
// Forwards to kernel(task, ...) with the unary transform kernel and a launch
// shape recomputed from the length of [first, last).
// NOTE(review): presumably this rebinds the parameters of an existing
// transform task - confirm against kernel(task, ...).
// NOTE(review): the std::distance result is narrowed to unsigned, so a range
// whose length does not fit in unsigned would wrap.
template <typename Creator, typename Deleter>
template <typename I, typename O, typename C, typename E>
void cudaGraphExecBase<Creator, Deleter>::transform(cudaTask task, I first, I last, O output, C c) {
  unsigned num_items = static_cast<unsigned>(std::distance(first, last));
  // After the task, the arguments are presumably grid size, block size, and
  // dynamic shared-memory bytes - confirm against kernel().
  kernel(
    task,
    E::num_blocks(num_items),
    E::nt,
    0,
    detail::cuda_transform_kernel<I, O, C, E>,
    first, num_items, output, c
  );
}
// Function: update transform
// Forwards to kernel(task, ...) with the binary transform kernel and a launch
// shape recomputed from the length of [first1, last1).
// NOTE(review): presumably this rebinds the parameters of an existing
// transform task - confirm against kernel(task, ...).
// NOTE(review): the std::distance result is narrowed to unsigned, so a range
// whose length does not fit in unsigned would wrap.
template <typename Creator, typename Deleter>
template <typename I1, typename I2, typename O, typename C, typename E>
void cudaGraphExecBase<Creator, Deleter>::transform(
  cudaTask task, I1 first1, I1 last1, I2 first2, O output, C c
) {
  unsigned num_items = static_cast<unsigned>(std::distance(first1, last1));
  // After the task, the arguments are presumably grid size, block size, and
  // dynamic shared-memory bytes - confirm against kernel().
  kernel(
    task,
    E::num_blocks(num_items),
    E::nt,
    0,
    detail::cuda_transform_kernel<I1, I2, O, C, E>,
    first1, first2, num_items, output, c
  );
}
} // end of namespace tf -----------------------------------------------------