
Commit 9d7ba1d

feat: Support for allocating GPU memory based on the selected profile (#108)
1 parent 2b060ca

File tree: 4 files changed (+83, -28 lines)

README.md (+21)

@@ -99,3 +99,24 @@ but the listed CMake argument can be used to override.
 * triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag]
 * triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag]
 * triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag]
+
+## Parameters
+
+Triton exposes some flags to control the execution mode of the TensorRT models through
+the Parameters section of the model's `config.pbtxt` file.
+
+### execution_context_allocation_strategy
+
+Different memory allocation behaviors for IExecutionContext. IExecutionContext requires a block of device memory for internal activation tensors during inference. The user can let the execution context manage the memory in various ways. Current options are "STATIC" (default) and "ON_PROFILE_CHANGE".
+
+* "STATIC": Default static allocation with the maximum size across all profiles.
+* "ON_PROFILE_CHANGE": Reallocate for a profile when it's selected.
+
+```
+parameters: {
+  key: "execution_context_allocation_strategy"
+  value: {
+    string_value: "STATIC"
+  }
+}
+```
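Both supported values come straight from this README addition: the example above pins the default "STATIC" strategy, while a model whose activation memory should track the selected profile would use "ON_PROFILE_CHANGE" instead (the only other value the backend accepts, per the model_state.cc hunk below). A minimal variant of the same `config.pbtxt` stanza:

```
parameters: {
  key: "execution_context_allocation_strategy"
  value: {
    string_value: "ON_PROFILE_CHANGE"
  }
}
```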

src/instance_state.cc (+14, -25)

@@ -1,4 +1,4 @@
-// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -1693,19 +1693,6 @@ ModelInstanceState::InitIOIndexMap()
 TRITONSERVER_Error*
 ModelInstanceState::InitOptimizationProfiles()
 {
-  // TRT sets the optimization profile index to be 0 implicitly with
-  // the first context creation. As currently triton supports one
-  // context per engine, in order to set the specified profile_index,
-  // another context is created and the previous context is destroyed.
-  std::shared_ptr<nvinfer1::IExecutionContext> default_trt_context(
-      engine_->createExecutionContext());
-  if (default_trt_context == nullptr) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INTERNAL,
-        (std::string("unable to create TensorRT context: ") +
-         model_state_->GetTensorRTLogger().LastErrorMsg())
-            .c_str());
-  }
   std::vector<std::pair<std::string, int>> profile_name_index;
   // No optimization profile is set for this TensorRT plan
   if (ProfileNames().empty()) {
@@ -1736,17 +1723,19 @@ ModelInstanceState::InitOptimizationProfiles()
             .c_str());
       continue;
     }
-    if (profile_index == 0) {
-      res.first->second.context_ = std::move(default_trt_context);
-    } else {
-      res.first->second.context_.reset(engine_->createExecutionContext());
-      if (res.first->second.context_ == nullptr) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INTERNAL,
-            (std::string("unable to create TensorRT context: ") +
-             model_state_->GetTensorRTLogger().LastErrorMsg())
-                .c_str());
-      }
+
+    // Create a new execution context for the profile
+    res.first->second.context_.reset(
+        engine_->createExecutionContext(model_state_->AllocationStrategy()));
+    if (res.first->second.context_ == nullptr) {
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_INTERNAL,
+          (std::string("unable to create TensorRT context: ") +
+           model_state_->GetTensorRTLogger().LastErrorMsg())
+              .c_str());
+    }
+
+    if (profile_index != 0) {
       if (!res.first->second.context_->setOptimizationProfileAsync(
               profile_index, stream_)) {
         return TRITONSERVER_ErrorNew(
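To make the rewritten loop easier to follow, here is a condensed, standalone sketch of the same pattern: one execution context per optimization profile, each created with the configured allocation strategy, with an explicit async selection for every profile except 0 (which TensorRT binds implicitly at creation, per the removed comment). Names here are illustrative, not the backend's actual helpers, and it assumes a TensorRT version whose `createExecutionContext()` accepts an allocation strategy, as the hunk above relies on:

```
#include <NvInfer.h>
#include <cuda_runtime_api.h>

#include <memory>
#include <vector>

// Sketch: create one IExecutionContext per optimization profile using the
// strategy parsed from the model config.
std::vector<std::unique_ptr<nvinfer1::IExecutionContext>>
CreateProfileContexts(
    nvinfer1::ICudaEngine& engine,
    nvinfer1::ExecutionContextAllocationStrategy strategy, cudaStream_t stream)
{
  std::vector<std::unique_ptr<nvinfer1::IExecutionContext>> contexts;
  for (int32_t i = 0; i < engine.getNbOptimizationProfiles(); ++i) {
    std::unique_ptr<nvinfer1::IExecutionContext> ctx(
        engine.createExecutionContext(strategy));
    if (ctx == nullptr) {
      break;  // creation failed; real code surfaces the TRT logger's error
    }
    // TensorRT selects profile 0 implicitly at creation, so only non-zero
    // profiles need an explicit asynchronous selection on the stream.
    if (i != 0 && !ctx->setOptimizationProfileAsync(i, stream)) {
      break;  // profile selection failed
    }
    contexts.push_back(std::move(ctx));
  }
  return contexts;
}
```

With "STATIC", each context reserves the maximum activation size across all profiles up front; with "ON_PROFILE_CHANGE", TensorRT reallocates that block when a different profile is selected, which can shrink the footprint at the cost of reallocation when profiles change.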
src/model_state.cc (+40, -2)

@@ -1,4 +1,4 @@
-// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -142,7 +142,8 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
 }
 
 ModelState::ModelState(TRITONBACKEND_Model* triton_model)
-    : TensorRTModel(triton_model), engine_sharing_(true)
+    : TensorRTModel(triton_model), engine_sharing_(true),
+      alloc_strategy_(nvinfer1::ExecutionContextAllocationStrategy::kSTATIC)
 {
   // Obtain backend configuration
   TRITONBACKEND_Backend* backend;
@@ -288,6 +289,43 @@ ModelState::ValidateModelConfig()
 TRITONSERVER_Error*
 ModelState::ParseParameters()
 {
+  triton::common::TritonJson::Value params;
+  bool status = ModelConfig().Find("parameters", &params);
+  if (status) {
+    // If 'execution_context_allocation_strategy' is not present in
+    // 'parameters', will use the default strategy "STATIC".
+    std::string alloc_strategy;
+    TRITONSERVER_Error* err = GetParameterValue(
+        params, "execution_context_allocation_strategy", &alloc_strategy);
+    if (err != nullptr) {
+      if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) {
+        return err;
+      } else {
+        TRITONSERVER_ErrorDelete(err);
+      }
+    } else {
+      // 'execution_context_allocation_strategy' is present in model config
+      // parameters.
+      if (alloc_strategy == "STATIC") {
+        alloc_strategy_ = nvinfer1::ExecutionContextAllocationStrategy::kSTATIC;
+      } else if (alloc_strategy == "ON_PROFILE_CHANGE") {
+        alloc_strategy_ =
+            nvinfer1::ExecutionContextAllocationStrategy::kON_PROFILE_CHANGE;
+      } else {
+        return TRITONSERVER_ErrorNew(
+            TRITONSERVER_ERROR_INVALID_ARG,
+            ("Invalid value for 'execution_context_allocation_strategy': '" +
+             alloc_strategy + "' for model instance '" + Name() +
+             "'. Supported values are 'STATIC' and 'ON_PROFILE_CHANGE'.")
+                .c_str());
+      }
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_INFO,
+          ("'execution_context_allocation_strategy' set to '" + alloc_strategy +
+           "' for model instance '" + Name() + "'")
+              .c_str());
+    }
+  }
   return nullptr; // success
 }
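One detail worth noting for readers of this hunk: `ParseParameters` walks a `TritonJson::Value` because Triton hands the backend the model configuration as JSON rather than protobuf text. Assuming the usual config-to-JSON conversion, the README's `config.pbtxt` stanza reaches this code in roughly the following shape (a sketch, not captured output):

```
{
  "parameters": {
    "execution_context_allocation_strategy": {
      "string_value": "ON_PROFILE_CHANGE"
    }
  }
}
```

A missing key yields the NOT_FOUND error that the code above deliberately swallows, leaving the default `kSTATIC` strategy set in the constructor's initializer list in effect.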

src/model_state.h (+8, -1)

@@ -1,4 +1,4 @@
-// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -88,6 +88,11 @@ class ModelState : public TensorRTModel {
 
   TensorRTLogger& GetTensorRTLogger() { return tensorrt_logger_; }
 
+  nvinfer1::ExecutionContextAllocationStrategy AllocationStrategy() const
+  {
+    return alloc_strategy_;
+  }
+
  private:
   ModelState(TRITONBACKEND_Model* triton_model);
 
@@ -140,6 +145,8 @@ class ModelState : public TensorRTModel {
 
   // Whether the backend should support version-compatible TensorRT models.
   static inline bool is_version_compatible_{false};
+
+  nvinfer1::ExecutionContextAllocationStrategy alloc_strategy_;
 };