Skip to content

Commit 0ab2ca5

Browse files
authored
Add checks for flags values and improve shm growth error handling (triton-inference-server#45)
* Add checks for flags values
* Improve error handling when growing shared memory
1 parent 91e5fc2 commit 0ab2ca5

5 files changed

Lines changed: 219 additions & 85 deletions

File tree

src/pb_stub.cc

Lines changed: 44 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,34 @@ class Stub {
199199

200200
std::unique_ptr<SharedMemory>& GetSharedMemory() { return shm_pool_; }
201201

202+
// Marks `response` as failed and tries to attach `err_message` to it.
//
// `has_error` is always set to true. `is_error_set` becomes true only when
// the message text was stored and a non-zero offset came back from
// SaveStringToSharedMemory; otherwise it stays false and `error` is left
// untouched.
void SetErrorForResponse(Response* response, const char* err_message)
{
  response->is_error_set = false;
  response->has_error = true;

  // Stays 0 unless SaveStringToSharedMemory fills it in.
  // NOTE(review): LOG_IF_EXCEPTION presumably logs a failure instead of
  // propagating it -- confirm against the macro definition.
  off_t message_offset = 0;
  LOG_IF_EXCEPTION(
      SaveStringToSharedMemory(shm_pool_, message_offset, err_message));

  if (message_offset == 0) {
    // No message was stored; report the error without a message.
    return;
  }

  response->error = message_offset;
  response->is_error_set = true;
}
215+
216+
void SetErrorForResponseBatch(const char* err_message)
217+
{
218+
off_t err_string_offset = 0;
219+
response_batch_->is_error_set = false;
220+
response_batch_->has_error = true;
221+
LOG_IF_EXCEPTION(
222+
SaveStringToSharedMemory(shm_pool_, err_string_offset, err_message));
223+
224+
if (err_string_offset != 0) {
225+
response_batch_->error = err_string_offset;
226+
response_batch_->is_error_set = true;
227+
}
228+
}
229+
202230
void ProcessResponse(
203231
Response* response_shm, ResponseBatch* response_batch,
204232
py::handle response, py::object& serialize_bytes)
@@ -210,15 +238,11 @@ class Stub {
210238
bool has_error = py_has_error;
211239

212240
if (has_error) {
213-
response_shm->has_error = true;
214-
off_t err_string_offset;
215241
py::str py_string_err = py::str(response.attr("error")());
216242
std::string response_error = py_string_err;
217-
LOG_IF_EXCEPTION(SaveStringToSharedMemory(
218-
shm_pool_, err_string_offset, response_error.c_str()));
219-
response_shm->error = err_string_offset;
243+
SetErrorForResponse(response_shm, response_error.c_str());
220244

221-
// Skip the response value if it has error
245+
// Skip the response value when the response has error.
222246
return;
223247
}
224248

@@ -262,10 +286,7 @@ class Stub {
262286
if (serialize_bytes.is_none()) {
263287
const char* err_message = "An error happened during serialization.";
264288
LOG_INFO << err_message;
265-
off_t err_message_offset;
266-
SaveStringToSharedMemory(shm_pool_, err_message_offset, err_message);
267-
response_shm->has_error = true;
268-
response_shm->error = err_message_offset;
289+
SetErrorForResponse(response_shm, err_message);
269290
return;
270291
}
271292

@@ -419,12 +440,7 @@ class Stub {
419440

420441
void SetResponseFromException(const PythonBackendException& pb_exception)
421442
{
422-
off_t err_string_offset;
423-
LOG_IF_EXCEPTION(SaveStringToSharedMemory(
424-
shm_pool_, err_string_offset,
425-
pb_exception.err_->error_message.c_str()));
426-
response_batch_->has_error = true;
427-
response_batch_->error = err_string_offset;
443+
SetErrorForResponseBatch(pb_exception.err_->error_message.c_str());
428444
}
429445

430446
int Execute()
@@ -485,12 +501,7 @@ class Stub {
485501
std::string message = "Python model " + model_path_ +
486502
" does not implement `execute` method.";
487503
LOG_INFO << message;
488-
489-
off_t err_string_offset;
490-
LOG_IF_EXCEPTION(SaveStringToSharedMemory(
491-
shm_pool_, err_string_offset, message.c_str()));
492-
response_batch_->has_error = true;
493-
response_batch_->error = err_string_offset;
504+
SetErrorForResponseBatch(message.c_str());
494505

495506
return 0;
496507
}
@@ -500,12 +511,9 @@ class Stub {
500511
responses = model_instance_.attr("execute")(py_request_list);
501512
}
502513
catch (const py::error_already_set& e) {
503-
off_t err_string_offset;
504514
LOG_INFO << e.what();
505-
LOG_IF_EXCEPTION(
506-
SaveStringToSharedMemory(shm_pool_, err_string_offset, e.what()));
507-
response_batch_->has_error = true;
508-
response_batch_->error = err_string_offset;
515+
SetErrorForResponseBatch(e.what());
516+
509517
return 0;
510518
}
511519

@@ -536,13 +544,8 @@ class Stub {
536544
catch (const PythonBackendException& pb_exception) {
537545
LOG_EXCEPTION(pb_exception);
538546
pb_exception.err_->error_message.c_str();
539-
540-
off_t err_string_offset;
541-
LOG_IF_EXCEPTION(SaveStringToSharedMemory(
542-
shm_pool_, err_string_offset,
543-
pb_exception.err_->error_message.c_str()));
544-
response_shm->has_error = true;
545-
response_shm->error = err_string_offset;
547+
SetErrorForResponse(
548+
response_shm, pb_exception.err_->error_message.c_str());
546549
}
547550
i += 1;
548551
}
@@ -596,11 +599,8 @@ class Stub {
596599

597600
catch (const py::error_already_set& e) {
598601
LOG_INFO << e.what();
602+
SetErrorForResponseBatch(e.what());
599603

600-
off_t err_string_offset;
601-
SaveStringToSharedMemory(shm_pool_, err_string_offset, e.what()),
602-
response_batch_->has_error = true;
603-
response_batch_->error = err_string_offset;
604604
NotifyParent();
605605
exit(1);
606606
}
@@ -685,7 +685,8 @@ main(int argc, char** argv)
685685

686686
pid_t parent_pid = std::stoi(argv[5]);
687687
bool background_thread_running = true;
688-
std::thread background_thread([&parent_pid, &background_thread_running] {
688+
std::thread background_thread([&parent_pid, &background_thread_running,
689+
&stub] {
689690
while (background_thread_running) {
690691
// Every two seconds check if the parent process is alive.
691692
sleep(2);
@@ -695,6 +696,10 @@ main(int argc, char** argv)
695696
}
696697

697698
pid_t child_pid = getpid();
699+
700+
// Destroy Stub
701+
stub.reset();
702+
698703
// Kill the process
699704
kill(child_pid, SIGTERM);
700705
LOG_INFO << "Non-graceful termination detected. Killing the child stub: "

src/pb_utils.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,13 +112,15 @@ struct Response {
112112
uint32_t outputs_size;
113113
off_t error;
114114
bool has_error;
115+
bool is_error_set; // Indicates whether this error has a message or not.
115116
};
116117

117118
// Describes a batch of responses together with an optional batch-level
// error. The stub fills in the error fields via SetErrorForResponseBatch.
struct ResponseBatch {
118119
off_t responses; // Offset for response object.
119120
uint32_t batch_size;  // presumably the number of responses in the batch -- TODO confirm
120121
off_t error;  // Offset of the error message string; only written when is_error_set is true.
121122
bool has_error;  // True when an error was recorded for the batch.
123+
bool is_error_set; // Indicates whether this error has a message or not.
122124
};
123125

124126
struct RequestBatch {

0 commit comments

Comments
 (0)