Update dlpack implementation for PbTensor (triton-inference-server#223)

oandreeva-nv · web-flow · commit 6c4b81711368 · 2023-05-08T16:20:40.000-07:00
* Update dlpack implementation for PbTensor: handle new API + bools
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -89,7 +89,7 @@ FetchContent_MakeAvailable(pybind11)
 FetchContent_Declare(
   dlpack
   GIT_REPOSITORY "https://github.com/dmlc/dlpack"
-  GIT_TAG "v0.7"
+  GIT_TAG "v0.8"
   GIT_SHALLOW ON
 )
 FetchContent_MakeAvailable(dlpack)
diff --git a/README.md b/README.md
@@ -1226,6 +1226,15 @@ class TritonPythonModel:
     # tensor.
     input0 = pb_utils.Tensor.from_dlpack("INPUT0", to_dlpack(pytorch_tensor))
 ```
+Python backend allows tensors implementing the
+[`__dlpack__`](https://data-apis.org/array-api/2022.12/API_specification/generated/array_api.array.__dlpack__.html)
+and [`__dlpack_device__`](https://data-apis.org/array-api/2022.12/API_specification/generated/array_api.array.__dlpack_device__.html)
+[interface](https://dmlc.github.io/dlpack/latest/python_spec.html)
+to be converted to Python backend tensors. For instance:
+
+```python
+input0 = pb_utils.Tensor.from_dlpack("INPUT0", pytorch_tensor)
+```
 
 This method only supports contiguous Tensors that are in C-order. If the tensor
 is not C-order contiguous an exception will be raised.
diff --git a/src/pb_stub.cc b/src/pb_stub.cc
@@ -1488,7 +1488,9 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
       .def("to_dlpack", &PbTensor::ToDLPack)
       .def("is_cpu", &PbTensor::IsCPU)
       .def("shape", &PbTensor::Dims)
-      .def("from_dlpack", &PbTensor::FromDLPack);
+      .def("from_dlpack", &PbTensor::FromDLPack)
+      .def("__dlpack__", &PbTensor::DLPack, py::arg("stream") = py::none())
+      .def("__dlpack_device__", &PbTensor::DLPackDevice);
 
   py::class_<InferResponse, std::shared_ptr<InferResponse>>(
       module, "InferenceResponse")
diff --git a/src/pb_stub_utils.cc b/src/pb_stub_utils.cc
@@ -1,4 +1,4 @@
-// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -189,8 +189,8 @@ triton_to_dlpack_type(TRITONSERVER_DataType triton_dtype)
   dl_dtype.lanes = 1;
   switch (triton_dtype) {
     case TRITONSERVER_TYPE_BOOL:
-      dl_code = DLDataTypeCode::kDLInt;
-      dt_size = 1;
+      dl_code = DLDataTypeCode::kDLBool;
+      dt_size = 8;
       break;
     case TRITONSERVER_TYPE_UINT8:
       dl_code = DLDataTypeCode::kDLUInt;
@@ -279,8 +279,6 @@ dlpack_to_triton_type(const DLDataType& data_type)
       return TRITONSERVER_TYPE_INT32;
     } else if (data_type.bits == 64) {
       return TRITONSERVER_TYPE_INT64;
-    } else if (data_type.bits == 1) {
-      return TRITONSERVER_TYPE_BOOL;
     }
   }
 
@@ -296,6 +294,12 @@ dlpack_to_triton_type(const DLDataType& data_type)
     }
   }
 
+  if (data_type.code == DLDataTypeCode::kDLBool) {
+    if (data_type.bits == 8) {
+      return TRITONSERVER_TYPE_BOOL;
+    }
+  }
+
   return TRITONSERVER_TYPE_INVALID;
 }
 }}}  // namespace triton::backend::python
diff --git a/src/pb_tensor.cc b/src/pb_tensor.cc
@@ -231,6 +231,36 @@ PbTensor::FromNumpy(const std::string& name, py::array& numpy_array)
   return std::make_shared<PbTensor>(name, numpy_array);
 }
 
+DLDeviceType
+PbTensor::DeviceType()
+{
+  DLDeviceType device_type{};
+
+  switch (memory_type_) {
+    case TRITONSERVER_MEMORY_GPU:
+      device_type = DLDeviceType::kDLCUDA;
+      break;
+    case TRITONSERVER_MEMORY_CPU:
+      device_type = DLDeviceType::kDLCPU;
+      break;
+    case TRITONSERVER_MEMORY_CPU_PINNED:
+      device_type = DLDeviceType::kDLCUDAHost;
+      break;
+  }
+
+  return device_type;
+}
+
+py::capsule
+PbTensor::DLPack(const py::object& stream)
+{
+  // An external consumer calls PbTensor's `__dlpack__` method to obtain
+  // a PyCapsule. By the design of PbTensor, in the GPU case no pending work
+  // is scheduled on PbTensor's data, so we can simply pass
+  // the capsule without synchronization.
+  return this->ToDLPack();
+}
+
 py::capsule
 PbTensor::ToDLPack()
 {
@@ -269,23 +299,19 @@ PbTensor::ToDLPack()
   tensor_handle.inc_ref();
 
   dlpack_tensor->dl_tensor.device.device_id = memory_type_id_;
+  dlpack_tensor->dl_tensor.device.device_type = this->DeviceType();
   dlpack_tensor->dl_tensor.dtype = triton_to_dlpack_type(dtype_);
 
-  switch (memory_type_) {
-    case TRITONSERVER_MEMORY_GPU:
-      dlpack_tensor->dl_tensor.device.device_type = DLDeviceType::kDLCUDA;
-      break;
-    case TRITONSERVER_MEMORY_CPU:
-      dlpack_tensor->dl_tensor.device.device_type = DLDeviceType::kDLCPU;
-      break;
-    case TRITONSERVER_MEMORY_CPU_PINNED:
-      dlpack_tensor->dl_tensor.device.device_type = DLDeviceType::kDLCUDAHost;
-      break;
-  }
-
   return py::capsule(
       static_cast<void*>(dlpack_tensor), "dltensor", &delete_unused_dltensor);
 }
+
+std::pair<int32_t, int64_t>
+PbTensor::DLPackDevice()
+{
+  return std::pair<int32_t, int64_t>(this->DeviceType(), memory_type_id_);
+}
+
 #endif  // TRITON_PB_STUB
 
 void
@@ -305,12 +331,100 @@ PbTensor::Memory()
 
 #ifdef TRITON_PB_STUB
 std::shared_ptr<PbTensor>
-PbTensor::FromDLPack(const std::string& name, const py::capsule& dlpack_tensor)
+PbTensor::FromDLPack(const std::string& name, const py::object& tensor)
 {
   if (name == "") {
     throw PythonBackendException("Tensor name cannot be an empty string.");
   }
+  if (py::isinstance<py::capsule>(tensor)) {
+    return FromDLPackCapsule(name, tensor);
+  }
+
+  if (!py::hasattr(tensor, "__dlpack__") ||
+      !py::hasattr(tensor, "__dlpack_device__")) {
+    throw PythonBackendException(
+        "Provided tensor is not supported. Tensor must be a DLPack capsule \
+        or have `__dlpack__` and `__dlpack_device__` attributes");
+  }
+
+  auto capsule_device_info =
+      tensor.attr("__dlpack_device__")().cast<std::pair<int32_t, int64_t>>();
+  if (capsule_device_info.first == DLDeviceType::kDLCUDA) {
+#ifdef TRITON_ENABLE_GPU
+    int current_device;
+    cudaError_t err = cudaGetDevice(&current_device);
+    if (err != cudaSuccess) {
+      throw PythonBackendException("Failed to get current CUDA device id.");
+    }
+
+    bool overridden = (current_device != capsule_device_info.second);
+    err = overridden ? cudaSetDevice(capsule_device_info.second) : cudaSuccess;
+    if (err != cudaSuccess) {
+      throw PythonBackendException(
+          "Failed to set CUDA device to device with id " +
+          std::to_string(capsule_device_info.second));
+    }
+    // In case there is a pending job on the data that this capsule
+    // points to, we need to wait for it before consuming.
+    // This matters when the data is located on a different
+    // context (GPU) and work is done on the default stream.
+    // For this scenario, the __dlpack__ implementation may skip
+    // synchronization (since the work is on the default stream)
+    // and we would return a pointer to data on a different GPU too early
+    // (i.e. before pending work is done). Thus we sync on the default stream
+    // only in the case we switched to a different context.
+    err = overridden ? cudaStreamSynchronize(0) : cudaSuccess;
+    if (err != cudaSuccess) {
+      throw PythonBackendException(
+          "Failed to synchronize CUDA device with id " +
+          std::to_string(
+              overridden ? capsule_device_info.second : current_device));
+    }
+
+    // Array API requirements for the stream argument:
+    // stream = 1 denotes the legacy default stream (in this case the
+    // producer should synchronize on CUDA stream 0).
+    // For CPU, `stream=None` is the only accepted argument
+    // according to the array API. For GPU, when `stream=None` the producer
+    // must assume the legacy default stream. Reference:
+    // https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html
+    auto ptr_to_tensor = FromDLPackCapsule(
+        name, tensor.attr("__dlpack__")(py::arg("stream") = py::int_(1)));
+
+    err = overridden ? cudaSetDevice(current_device) : cudaSuccess;
+    if (err != cudaSuccess) {
+      throw PythonBackendException(
+          "Failed to set CUDA device back to initial compute device "
+          "with id " +
+          std::to_string(current_device));
+    }
+    return ptr_to_tensor;
+#else
+    throw PythonBackendException(
+        "DLPack capsule passed pointer to memory allocated on GPU device, \
+          when GPU is not available");
+#endif
+  } else if (
+      capsule_device_info.first != DLDeviceType::kDLCPU &&
+      capsule_device_info.first != DLDeviceType::kDLCUDAHost) {
+    throw PythonBackendException(
+        "DLDevice type " + std::to_string(capsule_device_info.first) +
+        " is not support by Python backend.");
+  }
+
+  // If data is located on CPU, `stream=None` is the only accepted argument
+  // according to the array API. For GPU, when `stream=None` the producer must
+  // assume the legacy default stream.
+  // Reference:
+  // https://data-apis.org/array-api/latest/API_specification/generated/array_api.array.__dlpack__.html
+  return FromDLPackCapsule(
+      name, tensor.attr("__dlpack__")(py::arg("stream") = py::none()));
+}
 
+std::shared_ptr<PbTensor>
+PbTensor::FromDLPackCapsule(
+    const std::string& name, const py::capsule& dlpack_tensor)
+{
   DLManagedTensor* dl_managed_tensor =
       static_cast<DLManagedTensor*>(dlpack_tensor.get_pointer());
 
diff --git a/src/pb_tensor.h b/src/pb_tensor.h
@@ -1,4 +1,4 @@
-// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -112,11 +112,16 @@ class PbTensor {
   DISALLOW_COPY_AND_ASSIGN(PbTensor);
 
 #ifdef TRITON_PB_STUB
-  /// Construct a Python backend tensor using a DLPack
-  /// capsule.
+  /// Construct a Python backend tensor from an
+  /// external tensor.
   /// \param dlpack source dlpack tensor
   /// \param name name of the tensor
   static std::shared_ptr<PbTensor> FromDLPack(
+      const std::string& name, const py::object& dlpack);
+
+  /// Construct a Python backend tensor using a DLPack
+  /// capsule.
+  static std::shared_ptr<PbTensor> FromDLPackCapsule(
       const std::string& name, const py::capsule& dlpack);
 
   /// Construct a Python backend tensor using a NumPy object.
@@ -125,9 +130,23 @@ class PbTensor {
   static std::shared_ptr<PbTensor> FromNumpy(
       const std::string& name, py::array& numpy_array);
 
+  /// Get device type in DLPack format.
+  DLDeviceType DeviceType();
+
+  /// Exports tensor for consumption by `from_dlpack()` as a DLPack capsule.
+  /// \param stream  a Python integer representing a pointer to a stream,
+  ///                on devices that support streams
+  /// \return Capsule object containing pointer to a DLPack object.
+  py::capsule DLPack(const py::object& stream);
+
   /// Get a PyCapsule object containing the DLPack representation of the tensor.
   /// \return Capsule object containing pointer to a DLPack object.
   py::capsule ToDLPack();
+
+  /// Returns device type and device ID.
+  /// Meant for use within `from_dlpack()`.
+  /// \return a pair (device_type, device_id).
+  std::pair<int32_t, int64_t> DLPackDevice();
 #endif
 
   /// Get the name of the tensor

Original file line number	Diff line number	Diff line change
`@@ -89,7 +89,7 @@ FetchContent_MakeAvailable(pybind11)`
`89`	`89`	`FetchContent_Declare(`
`90`	`90`	`dlpack`
`91`	`91`	`GIT_REPOSITORY "https://github.com/dmlc/dlpack"`
`92`		`- GIT_TAG "v0.7"`
	`92`	`+ GIT_TAG "v0.8"`
`93`	`93`	`GIT_SHALLOW ON`
`94`	`94`	`)`
`95`	`95`	`FetchContent_MakeAvailable(dlpack)`