pcoder93
diff --git a/‎CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 7 additions & 5 deletions b/‎README.md‎
Lines changed: 7 additions & 5 deletions
diff --git a/‎src/infer_request.cc‎
Lines changed: 11 additions & 0 deletions b/‎src/infer_request.cc‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎src/memory_manager.cc‎
Lines changed: 119 additions & 0 deletions b/‎src/memory_manager.cc‎
Lines changed: 119 additions & 0 deletions
diff --git a/‎src/memory_manager.h‎
Lines changed: 81 additions & 0 deletions b/‎src/memory_manager.h‎
Lines changed: 81 additions & 0 deletions
@@ -132,7 +132,6 @@ set(
   src/infer_response.h
   src/infer_request.cc
   src/infer_request.h
-  src/message_queue.cc
   src/message_queue.h
   src/ipc_message.cc
   src/ipc_message.h
@@ -162,6 +161,8 @@ set(
   src/pb_env.h
   src/pb_metric_reporter.cc
   src/pb_metric_reporter.h
+  src/memory_manager.cc
+  src/memory_manager.h
   src/request_executor.cc
   src/request_executor.h
 )
 
@@ -1,5 +1,5 @@
 <!--
-# Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -679,6 +679,12 @@ class TritonPythonModel:
 A complete example for sync and async BLS in Python backend is included in the
 [Examples](#examples) section.
 
+Starting from the 22.04 release, the lifetime of the BLS output tensors has
+been improved such that if a tensor is no longer needed in your Python model, it
+will be automatically deallocated. This can increase the number of BLS requests
+that you can execute in your model without running out of GPU or shared
+memory.
+
 Note: Async BLS is not supported on Python 3.6 or lower due to the `async` keyword
 and `asyncio.run` being introduced in Python 3.7. 
 
@@ -710,10 +716,5 @@ flags = pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START | pb_utils.TRITONSERVE
 
 ## Limitations
 
-- The number of inference requests that can be executed as a part of your model
-execution is limited to the amount of shared memory available to the Triton
-server.  If you are using Docker to start the TritonServer, you can control the
-shared memory usage using the
-[`--shm-size`](https://docs.docker.com/engine/reference/run/) flag.
 - You need to make sure that the inference requests performed as a part of your model
 do not create a circular dependency. For example, if model A performs an inference request
 
@@ -423,6 +423,17 @@ InferRequest::Exec()
     std::unique_ptr<InferResponse> infer_response =
         InferResponse::LoadFromSharedMemory(
             shm_pool, *response_handle, true /* open cuda handle */);
+    auto& memory_manager_message_queue = stub->MemoryManagerQueue();
+
+    for (auto& output_tensor : infer_response->OutputTensors()) {
+      if (!output_tensor->IsCPU()) {
+        uint64_t memory_release_id = output_tensor->Memory()->MemoryReleaseId();
+        output_tensor->Memory()->SetMemoryReleaseCallback(
+            [&memory_manager_message_queue, memory_release_id]() {
+              memory_manager_message_queue->Push(memory_release_id);
+            });
+      }
+    }
 
     return infer_response;
   } else {
 
@@ -0,0 +1,119 @@
+// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "memory_manager.h"
+
+
+namespace triton { namespace backend { namespace python {
+
+
+#ifdef TRITON_ENABLE_GPU
+// Remembers `ptr` and installs a release callback that passes the pointer it
+// is given to cudaFree, logging an error (without throwing) if the free fails.
+GPUMemoryRecord::GPUMemoryRecord(void* ptr) : ptr_(ptr)
+{
+  release_callback_ = [](void* device_ptr) {
+    const cudaError_t status = cudaFree(device_ptr);
+    if (status == cudaSuccess) {
+      return;
+    }
+
+    const std::string message =
+        std::string("Failed to free the allocated cuda memory. error: ") +
+        cudaGetErrorString(status);
+    LOG_MESSAGE(TRITONSERVER_LOG_ERROR, message.c_str());
+  };
+}
+
+// Returns the pointer that was handed to the constructor.
+void*
+GPUMemoryRecord::MemoryId()
+{
+  return this->ptr_;
+}
+
+// Returns a reference to the callback installed by the constructor; the
+// reference is to a member of this record.
+const std::function<void(void*)>&
+GPUMemoryRecord::ReleaseCallback()
+{
+  return this->release_callback_;
+}
+#endif
+
+// Takes ownership of the message queue, starts the record counter at zero and
+// launches the background thread that runs QueueMonitorThread().
+MemoryManager::MemoryManager(
+    std::unique_ptr<MessageQueue<uint64_t>>&& memory_message_queue)
+    : record_count_(0), message_queue_(std::move(memory_message_queue))
+{
+  // thread_ is declared ahead of the other members, so it is started here in
+  // the body, after every member the monitor thread touches is constructed.
+  thread_ = std::thread(&MemoryManager::QueueMonitorThread, this);
+}
+
+// Registers `memory_record` under the next record id and returns that id.
+// The counter is incremented before use, so the first id handed out is 1.
+uint64_t
+MemoryManager::AddRecord(std::unique_ptr<MemoryRecord>&& memory_record)
+{
+  std::lock_guard<std::mutex> guard{mu_};
+
+  const uint64_t record_id = ++record_count_;
+  records_.emplace(record_id, std::move(memory_record));
+  return record_id;
+}
+
+// Body of the background thread. Pops record ids from the message queue and
+// releases the matching record. A popped value of 0 ends the loop. An id with
+// no matching record is logged and skipped.
+void
+MemoryManager::QueueMonitorThread()
+{
+  while (true) {
+    const uint64_t record_id = message_queue_->Pop();
+    // AddRecord never hands out id 0, so 0 is free to act as the stop signal
+    // pushed by the destructor.
+    if (record_id == 0) {
+      return;
+    }
+
+    std::unique_ptr<MemoryRecord> record;
+    {
+      std::lock_guard<std::mutex> lock{mu_};
+      auto it = records_.find(record_id);
+      if (it == records_.end()) {
+        LOG_MESSAGE(
+            TRITONSERVER_LOG_ERROR,
+            "Unexpected memory index received for deallocation.");
+        continue;
+      }
+
+      // Only the bookkeeping happens under the lock: take ownership of the
+      // record and drop its map entry.
+      record = std::move(it->second);
+      records_.erase(it);
+
+      // Reset the record_count_ when the number of records is zero.
+      if (records_.empty()) {
+        record_count_ = 0;
+      }
+    }
+
+    // Call the release callback after mu_ is released so that a slow release
+    // (e.g. cudaFree) does not stall concurrent AddRecord callers.
+    record->ReleaseCallback()(record->MemoryId());
+  }
+}
+
+// Stops the background thread: QueueMonitorThread() returns when it pops a 0,
+// so a 0 is pushed and the thread is then joined.
+MemoryManager::~MemoryManager()
+{
+  constexpr uint64_t kStopMessage = 0;
+  message_queue_->Push(kStopMessage);
+  thread_.join();
+}
+
+}}};  // namespace triton::backend::python
@@ -0,0 +1,81 @@
+// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <functional>
+#include <mutex>
+#include <thread>
+#include <unordered_map>
+#include "message_queue.h"
+#include "triton/backend/backend_common.h"
+#include "triton/core/tritonserver.h"
+
+#ifdef TRITON_ENABLE_GPU
+#include <cuda_runtime_api.h>
+#endif  // TRITON_ENABLE_GPU
+
+
+namespace triton { namespace backend { namespace python {
+
+/// Interface describing a piece of memory that can be released later.
+/// Implementations supply an identifier for the memory and a callback that is
+/// invoked with that identifier to release it.
+class MemoryRecord {
+ public:
+  /// Records are owned and destroyed through std::unique_ptr<MemoryRecord>, so
+  /// the destructor must be virtual for derived-class members to be destroyed
+  /// correctly.
+  virtual ~MemoryRecord() = default;
+
+  /// Returns the callback to invoke with MemoryId() to release the memory.
+  virtual const std::function<void(void*)>& ReleaseCallback() = 0;
+
+  /// Returns the opaque pointer that identifies the memory.
+  virtual void* MemoryId() = 0;
+};
+
+#ifdef TRITON_ENABLE_GPU
+/// MemoryRecord implementation whose release callback frees the pointer with
+/// cudaFree. Only available when TRITON_ENABLE_GPU is defined.
+class GPUMemoryRecord : public MemoryRecord {
+ public:
+  /// Stores `ptr` and installs the cudaFree-based release callback.
+  GPUMemoryRecord(void* ptr);
+  /// Returns the release callback installed by the constructor.
+  const std::function<void(void*)>& ReleaseCallback() override;
+  /// Returns the pointer passed to the constructor.
+  void* MemoryId() override;
+
+ private:
+  // Pointer handed to the constructor; passed to the release callback.
+  void* ptr_;
+  // Callback that calls cudaFree and logs an error on failure.
+  std::function<void(void*)> release_callback_;
+};
+#endif
+
+/// Memory manager class is used primarily for managing the lifetime of GPU
+/// tensors in BLS. It mainly consists of a background thread that monitors a
+/// message queue in shared memory. Whenever a GPU tensor is created, it will
+/// be pushed to the memory manager. The stub process must send a message to the
+/// message queue asking the memory manager to deallocate the GPU tensor.
+class MemoryManager {
+ public:
+  /// Takes ownership of `memory_message_queue` and starts the background
+  /// thread that pops record ids from it.
+  MemoryManager(std::unique_ptr<MessageQueue<uint64_t>>&& memory_message_queue);
+  /// Stores `memory_record` and returns the id it was registered under. Ids
+  /// start at 1; the value 0 is used as the stop signal for the thread.
+  uint64_t AddRecord(std::unique_ptr<MemoryRecord>&& memory_record);
+  // NOTE(review): no definition of ResetCounter appears in memory_manager.cc
+  // as added alongside this header -- confirm it is defined elsewhere or drop
+  // the declaration.
+  TRITONSERVER_Error* ResetCounter();
+  /// Pushes 0 to the message queue to stop the background thread, then joins
+  /// it.
+  ~MemoryManager();
+
+ private:
+  // Background thread running QueueMonitorThread().
+  std::thread thread_;
+  // Live records keyed by the id returned from AddRecord.
+  std::unordered_map<uint64_t, std::unique_ptr<MemoryRecord>> records_;
+  // Last id handed out; reset to 0 whenever records_ becomes empty.
+  uint64_t record_count_;
+  // Queue from which the background thread pops the ids to release.
+  std::unique_ptr<MessageQueue<uint64_t>> message_queue_;
+  // Loop executed by thread_: pops ids and releases the matching records.
+  void QueueMonitorThread();
+  // Guards records_ and record_count_.
+  std::mutex mu_;
+};
+}}};  // namespace triton::backend::python