Skip to content

Commit 0e6c78a

Browse files
authored
Improve the lifetime of BLS output tensors (triton-inference-server#131)
* Improve the lifetime of BLS output tensors
* Add a note about the improved lifecycle of BLS output tensors
* Fix stub shutdown if there is an error during initialization
* Fix CPU only build
* Fix memory manager destruction
1 parent 18cade6 commit 0e6c78a

17 files changed

Lines changed: 581 additions & 353 deletions

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,6 @@ set(
132132
src/infer_response.h
133133
src/infer_request.cc
134134
src/infer_request.h
135-
src/message_queue.cc
136135
src/message_queue.h
137136
src/ipc_message.cc
138137
src/ipc_message.h
@@ -162,6 +161,8 @@ set(
162161
src/pb_env.h
163162
src/pb_metric_reporter.cc
164163
src/pb_metric_reporter.h
164+
src/memory_manager.cc
165+
src/memory_manager.h
165166
src/request_executor.cc
166167
src/request_executor.h
167168
)

README.md

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
<!--
2-
# Copyright 2020-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
#
44
# Redistribution and use in source and binary forms, with or without
55
# modification, are permitted provided that the following conditions
@@ -679,6 +679,12 @@ class TritonPythonModel:
679679
A complete example for sync and async BLS in Python backend is included in the
680680
[Examples](#examples) section.
681681

682+
Starting from the 22.04 release, the lifetime of the BLS output tensors has
683+
been improved such that if a tensor is no longer needed in your Python model it
684+
will be automatically deallocated. This can increase the number of BLS requests
685+
that you can execute in your model without running out of GPU or shared
686+
memory.
687+
682688
Note: Async BLS is not supported on Python 3.6 or lower due to the `async` keyword
683689
and `asyncio.run` being introduced in Python 3.7.
684690

@@ -710,10 +716,6 @@ flags = pb_utils.TRITONSERVER_REQUEST_FLAG_SEQUENCE_START | pb_utils.TRITONSERVE
710716

711717
## Limitations
712718

713-
- The number of inference requests that can be executed as a part of your model
714-
execution is limited to the amount of shared memory available to the Triton
715-
server. If you are using Docker to start the TritonServer, you can control the
716-
shared memory usage using the
717719
[`--shm-size`](https://docs.docker.com/engine/reference/run/) flag.
718720
- You need to make sure that the inference requests performed as a part of your model
719721
do not create a circular dependency. For example, if model A performs an inference request

src/infer_request.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,17 @@ InferRequest::Exec()
423423
std::unique_ptr<InferResponse> infer_response =
424424
InferResponse::LoadFromSharedMemory(
425425
shm_pool, *response_handle, true /* open cuda handle */);
426+
auto& memory_manager_message_queue = stub->MemoryManagerQueue();
427+
428+
for (auto& output_tensor : infer_response->OutputTensors()) {
429+
if (!output_tensor->IsCPU()) {
430+
uint64_t memory_release_id = output_tensor->Memory()->MemoryReleaseId();
431+
output_tensor->Memory()->SetMemoryReleaseCallback(
432+
[&memory_manager_message_queue, memory_release_id]() {
433+
memory_manager_message_queue->Push(memory_release_id);
434+
});
435+
}
436+
}
426437

427438
return infer_response;
428439
} else {

src/memory_manager.cc

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
//
3+
// Redistribution and use in source and binary forms, with or without
4+
// modification, are permitted provided that the following conditions
5+
// are met:
6+
// * Redistributions of source code must retain the above copyright
7+
// notice, this list of conditions and the following disclaimer.
8+
// * Redistributions in binary form must reproduce the above copyright
9+
// notice, this list of conditions and the following disclaimer in the
10+
// documentation and/or other materials provided with the distribution.
11+
// * Neither the name of NVIDIA CORPORATION nor the names of its
12+
// contributors may be used to endorse or promote products derived
13+
// from this software without specific prior written permission.
14+
//
15+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16+
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19+
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22+
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23+
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
27+
#include "memory_manager.h"
28+
29+
30+
namespace triton { namespace backend { namespace python {
31+
32+
33+
#ifdef TRITON_ENABLE_GPU
// Creates a record for the memory at `ptr`. The stored release callback
// frees the pointer it is given with cudaFree and logs an error (without
// throwing) when the free fails.
GPUMemoryRecord::GPUMemoryRecord(void* ptr)
    : ptr_(ptr), release_callback_([](void* device_ptr) {
        const cudaError_t status = cudaFree(device_ptr);
        if (status == cudaSuccess) {
          return;
        }

        const std::string message =
            std::string("Failed to free the allocated cuda memory. error: ") +
            cudaGetErrorString(status);
        LOG_MESSAGE(TRITONSERVER_LOG_ERROR, message.c_str());
      })
{
}

// Returns the pointer this record was constructed with.
void*
GPUMemoryRecord::MemoryId()
{
  return ptr_;
}

// Returns the callback that releases the memory; callers are expected to
// invoke it with the value returned by MemoryId().
const std::function<void(void*)>&
GPUMemoryRecord::ReleaseCallback()
{
  return release_callback_;
}
#endif
61+
62+
// Takes ownership of the message queue and starts the background thread
// that services release requests arriving on it.
MemoryManager::MemoryManager(
    std::unique_ptr<MessageQueue<uint64_t>>&& memory_message_queue)
    : record_count_(0), message_queue_(std::move(memory_message_queue))
{
  // The monitor thread is launched from the constructor body on purpose:
  // thread_ is declared before message_queue_, so starting it from the
  // initializer list would let it run before the queue is set.
  std::thread monitor(&MemoryManager::QueueMonitorThread, this);
  thread_ = std::move(monitor);
}
69+
70+
uint64_t
71+
MemoryManager::AddRecord(std::unique_ptr<MemoryRecord>&& memory_record)
72+
{
73+
std::lock_guard<std::mutex> lock{mu_};
74+
75+
record_count_++;
76+
records_.emplace(record_count_, std::move(memory_record));
77+
78+
return record_count_;
79+
}
80+
81+
void
82+
MemoryManager::QueueMonitorThread()
83+
{
84+
while (true) {
85+
uint64_t memory = message_queue_->Pop();
86+
if (memory == 0) {
87+
return;
88+
}
89+
90+
{
91+
std::lock_guard<std::mutex> lock{mu_};
92+
auto it = records_.find(memory);
93+
if (it == records_.end()) {
94+
LOG_MESSAGE(
95+
TRITONSERVER_LOG_ERROR,
96+
"Unexpected memory index received for deallocation.");
97+
continue;
98+
}
99+
100+
// Call the release callback.
101+
it->second->ReleaseCallback()(it->second->MemoryId());
102+
records_.erase(it);
103+
104+
// Reset the record_count_ when the number of records is zero.
105+
if (records_.size() == 0)
106+
record_count_ = 0;
107+
}
108+
}
109+
}
110+
111+
MemoryManager::~MemoryManager()
112+
{
113+
// Push a dummy 0 message that will trigger the destruction of the background
114+
// thread.
115+
message_queue_->Push(0);
116+
thread_.join();
117+
}
118+
119+
}}}; // namespace triton::backend::python

src/memory_manager.h

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
//
3+
// Redistribution and use in source and binary forms, with or without
4+
// modification, are permitted provided that the following conditions
5+
// are met:
6+
// * Redistributions of source code must retain the above copyright
7+
// notice, this list of conditions and the following disclaimer.
8+
// * Redistributions in binary form must reproduce the above copyright
9+
// notice, this list of conditions and the following disclaimer in the
10+
// documentation and/or other materials provided with the distribution.
11+
// * Neither the name of NVIDIA CORPORATION nor the names of its
12+
// contributors may be used to endorse or promote products derived
13+
// from this software without specific prior written permission.
14+
//
15+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16+
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19+
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22+
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23+
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24+
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
27+
#include <functional>
28+
#include <mutex>
29+
#include <thread>
30+
#include <unordered_map>
31+
#include "message_queue.h"
32+
#include "triton/backend/backend_common.h"
33+
#include "triton/core/tritonserver.h"
34+
35+
#ifdef TRITON_ENABLE_GPU
36+
#include <cuda_runtime_api.h>
37+
#endif // TRITON_ENABLE_GPU
38+
39+
40+
namespace triton { namespace backend { namespace python {
41+
42+
/// Abstract description of a piece of memory tracked by the memory manager.
/// A concrete record supplies an identifier for the memory and the callback
/// that releases it.
class MemoryRecord {
 public:
  /// Records are stored and destroyed through std::unique_ptr<MemoryRecord>,
  /// so the destructor has to be virtual: without it, deleting a derived
  /// record through the base pointer is undefined behavior and the derived
  /// members are never destroyed.
  virtual ~MemoryRecord() = default;

  /// Returns the callback that releases the memory. It is invoked with the
  /// value returned by MemoryId().
  virtual const std::function<void(void*)>& ReleaseCallback() = 0;

  /// Returns the identifier (pointer) of the memory described by this record.
  virtual void* MemoryId() = 0;
};
47+
48+
#ifdef TRITON_ENABLE_GPU
49+
class GPUMemoryRecord : public MemoryRecord {
50+
public:
51+
GPUMemoryRecord(void* ptr);
52+
const std::function<void(void*)>& ReleaseCallback() override;
53+
void* MemoryId() override;
54+
55+
private:
56+
void* ptr_;
57+
std::function<void(void*)> release_callback_;
58+
};
59+
#endif
60+
61+
/// Memory manager class is used primarily for managing the lifetime of GPU
62+
/// tensors in BLS. It mainly consists of a background thread that monitors a
63+
/// message queue in shared memory. Whenever a GPU tensor is created, it will
64+
/// be pushed to the memory manager. The stub process must send a message to the
65+
/// message queue asking the memory manager to deallocate the GPU tensor.
66+
class MemoryManager {
67+
public:
68+
MemoryManager(std::unique_ptr<MessageQueue<uint64_t>>&& memory_message_queue);
69+
uint64_t AddRecord(std::unique_ptr<MemoryRecord>&& memory_record);
70+
TRITONSERVER_Error* ResetCounter();
71+
~MemoryManager();
72+
73+
private:
74+
std::thread thread_;
75+
std::unordered_map<uint64_t, std::unique_ptr<MemoryRecord>> records_;
76+
uint64_t record_count_;
77+
std::unique_ptr<MessageQueue<uint64_t>> message_queue_;
78+
void QueueMonitorThread();
79+
std::mutex mu_;
80+
};
81+
}}}; // namespace triton::backend::python

0 commit comments

Comments
 (0)