@@ -49,8 +49,8 @@ That is, the callable that describes a %cudaFlowCapturer
49 49 will be executed sequentially.
50 50 Inside a %cudaFlow capturer task, different GPU tasks (tf::cudaTask) may run
51 51 in parallel depending on the selected optimization algorithm.
52- By default, we use tf::cudaSequentialCapturing to generate a sequential
53- CUDA graph.
52+ By default, we use tf::cudaRoundRobinCapturing to transform a user-level
53+ graph into a native CUDA graph.
54 54
55 55 Please refer to @ref GPUTaskingcudaFlowCapturer for details.
56 56 */
@@ -125,6 +125,11 @@ class cudaFlowCapturer {
125 125 a user-described %cudaFlow:
126 126 + tf::cudaSequentialCapturing
127 127 + tf::cudaRoundRobinCapturing
128+ + tf::cudaLinearCapturing
129+
130+ By default, tf::cudaFlowCapturer uses the round-robin optimization
131+ algorithm with four streams to transform a user-level graph into
132+ a native CUDA graph.
128133 */
129134 template <typename OPT , typename ... ArgsT>
130135 OPT & make_optimizer (ArgsT&&... args);
@@ -1126,6 +1131,7 @@ inline cudaTask cudaFlowCapturer::memcpy(
11261131 });
11271132}
11281133
1134+ // Function: copy
11291135template <typename T, std::enable_if_t <!std::is_same_v<T, void >, void >*>
11301136cudaTask cudaFlowCapturer::copy (T* tgt, const T* src, size_t num) {
11311137 return on ([tgt, src, num] (cudaStream_t stream) mutable {
0 commit comments