@@ -231,6 +231,36 @@ PbTensor::FromNumpy(const std::string& name, py::array& numpy_array)
231231 return std::make_shared<PbTensor>(name, numpy_array);
232232}
233233
234+ DLDeviceType
235+ PbTensor::DeviceType ()
236+ {
237+ DLDeviceType device_type{};
238+
239+ switch (memory_type_) {
240+ case TRITONSERVER_MEMORY_GPU :
241+ device_type = DLDeviceType::kDLCUDA ;
242+ break ;
243+ case TRITONSERVER_MEMORY_CPU :
244+ device_type = DLDeviceType::kDLCPU ;
245+ break ;
246+ case TRITONSERVER_MEMORY_CPU_PINNED :
247+ device_type = DLDeviceType::kDLCUDAHost ;
248+ break ;
249+ }
250+
251+ return device_type;
252+ }
253+
254+ py::capsule
255+ PbTensor::DLPack (const py::object& stream)
256+ {
257+ // Here external tensor requests PbTensor's `__dlpack__` method to provide
258+ // a PyCapsule. By the design of PbTensor, in a GPU case no pending work
259+ // is scheduled to work with PbTensor's data and we can simply pass
260+ // the capsule without a synchronization.
261+ return this ->ToDLPack ();
262+ }
263+
234264py::capsule
235265PbTensor::ToDLPack ()
236266{
@@ -269,23 +299,19 @@ PbTensor::ToDLPack()
269299 tensor_handle.inc_ref ();
270300
271301 dlpack_tensor->dl_tensor .device .device_id = memory_type_id_;
302+ dlpack_tensor->dl_tensor .device .device_type = this ->DeviceType ();
272303 dlpack_tensor->dl_tensor .dtype = triton_to_dlpack_type (dtype_);
273304
274- switch (memory_type_) {
275- case TRITONSERVER_MEMORY_GPU :
276- dlpack_tensor->dl_tensor .device .device_type = DLDeviceType::kDLCUDA ;
277- break ;
278- case TRITONSERVER_MEMORY_CPU :
279- dlpack_tensor->dl_tensor .device .device_type = DLDeviceType::kDLCPU ;
280- break ;
281- case TRITONSERVER_MEMORY_CPU_PINNED :
282- dlpack_tensor->dl_tensor .device .device_type = DLDeviceType::kDLCUDAHost ;
283- break ;
284- }
285-
286305 return py::capsule (
287306 static_cast <void *>(dlpack_tensor), " dltensor" , &delete_unused_dltensor);
288307}
308+
309+ std::pair<int32_t , int64_t >
310+ PbTensor::DLPackDevice ()
311+ {
312+ return std::pair<int32_t , int64_t >(this ->DeviceType (), memory_type_id_);
313+ }
314+
289315#endif // TRITON_PB_STUB
290316
291317void
@@ -305,12 +331,100 @@ PbTensor::Memory()
305331
306332#ifdef TRITON_PB_STUB
307333std::shared_ptr<PbTensor>
308- PbTensor::FromDLPack (const std::string& name, const py::capsule& dlpack_tensor )
334+ PbTensor::FromDLPack (const std::string& name, const py::object& tensor )
309335{
310336 if (name == " " ) {
311337 throw PythonBackendException (" Tensor name cannot be an empty string." );
312338 }
339+ if (py::isinstance<py::capsule>(tensor)) {
340+ return FromDLPackCapsule (name, tensor);
341+ }
342+
343+ if (!py::hasattr (tensor, " __dlpack__" ) ||
344+ !py::hasattr (tensor, " __dlpack_device__" )) {
345+ throw PythonBackendException (
346+ " Provided tensor is not supported. Tensor must be a DLPack capsule \
347+ or have `__dlpack__` and `__dlpack_device__` attributes" );
348+ }
349+
350+ auto capsule_device_info =
351+ tensor.attr (" __dlpack_device__" )().cast <std::pair<int32_t , int64_t >>();
352+ if (capsule_device_info.first == DLDeviceType::kDLCUDA ) {
353+ #ifdef TRITON_ENABLE_GPU
354+ int current_device;
355+ cudaError_t err = cudaGetDevice (¤t_device);
356+ if (err != cudaSuccess) {
357+ throw PythonBackendException (" Failed to get current CUDA device id." );
358+ }
359+
360+ bool overridden = (current_device != capsule_device_info.second );
361+ err = overridden ? cudaSetDevice (capsule_device_info.second ) : cudaSuccess;
362+ if (err != cudaSuccess) {
363+ throw PythonBackendException (
364+ " Failed to set CUDA device to device with id " +
365+ std::to_string (capsule_device_info.second ));
366+ }
367+ // In case there is a pending job on the data, where this capsule
368+ // is pointing to, we need to wait for it before consuming.
369+ // This is important for when data is located on different
370+ // context (GPU) and work is done on the default stream.
371+ // For this scenario, __dlpack__ implementation may skip
372+ // syncronization (since the work is on the default stream)
373+ // and we will return pointer to the data on different GPU too early
374+ // (i.e. before pending work is done). Thus we sync on the default stream
375+ // only in the case we switched to a different context.
376+ err = overridden ? cudaStreamSynchronize (0 ) : cudaSuccess;
377+ if (err != cudaSuccess) {
378+ throw PythonBackendException (
379+ " Failed to synchronize CUDA device with id " +
380+ std::to_string (
381+ overridden ? capsule_device_info.second : current_device));
382+ }
383+
384+ // Array API requirements for the stream argument:
385+ // stream = 1 the legacy default stream (in this case should
386+ // synchronize on CUDA stream 0)
387+ // For CPU, `stream=None` is the only accepted argument
388+ // according to array API. For GPU, when `stream=None` producer
389+ // must assume the legacy default stream. Reference:
390+ // https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html
391+ auto ptr_to_tensor = FromDLPackCapsule (
392+ name, tensor.attr (" __dlpack__" )(py::arg (" stream" ) = py::int_ (1 )));
393+
394+ err = overridden ? cudaSetDevice (current_device) : cudaSuccess;
395+ if (err != cudaSuccess) {
396+ throw PythonBackendException (
397+ " Failed to set CUDA device back to initial compute device "
398+ " with id " +
399+ std::to_string (current_device));
400+ }
401+ return ptr_to_tensor;
402+ #else
403+ throw PythonBackendException (
404+ " DLPack capsule passed pointer to memory allocated on GPU device, \
405+ when GPU is not available" );
406+ #endif
407+ } else if (
408+ capsule_device_info.first != DLDeviceType::kDLCPU &&
409+ capsule_device_info.first != DLDeviceType::kDLCUDAHost ) {
410+ throw PythonBackendException (
411+ " DLDevice type " + std::to_string (capsule_device_info.first ) +
412+ " is not support by Python backend." );
413+ }
414+
415+ // If data is located on CPU, `stream=None` is the only accepted argument
416+ // according to array API. For GPU, when `stream=None` producer must
417+ // assume the legacy default stream.
418+ // Reference:
419+ // https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html
420+ return FromDLPackCapsule (
421+ name, tensor.attr (" __dlpack__" )(py::arg (" stream" ) = py::none ()));
422+ }
313423
424+ std::shared_ptr<PbTensor>
425+ PbTensor::FromDLPackCapsule (
426+ const std::string& name, const py::capsule& dlpack_tensor)
427+ {
314428 DLManagedTensor* dl_managed_tensor =
315429 static_cast <DLManagedTensor*>(dlpack_tensor.get_pointer ());
316430
0 commit comments