
Commit fff68c3

Avoid reusing buffer for node outputs with no consumers (#21019)

1 parent 846cac6 · commit fff68c3

7 files changed: +152 −15 lines changed

onnxruntime/core/framework/allocation_planner.cc

Lines changed: 19 additions & 4 deletions
@@ -469,6 +469,15 @@ class PlannerImpl {
     */
   }
 
+  static bool OutputHasConsumerNode(const Node& node, int output_idx) {
+    // There will be an edge to all consumer nodes.
+    // If consumed in a subgraph, the edge will be to an implicit input of the node containing the subgraph.
+    return std::any_of(node.OutputEdgesBegin(), node.OutputEdgesEnd(),
+                       [&output_idx](const Node::EdgeEnd& edge) {
+                         return edge.GetSrcArgIndex() == output_idx;
+                       });
+  }
+
   bool SameSize(const onnxruntime::NodeArg& arg1, const onnxruntime::NodeArg& arg2) {
     if ((!arg1.Exists()) || (!arg2.Exists())) return false;
     auto p_shape1 = context_->GetShape(arg1);

@@ -1172,8 +1181,8 @@
                                    value_consumer_map[output_idx_global].end());
          reused.insert(reusable_input);
          continue;
-        }  // if
-      }  // if
+        }
+      }
    }
  }

@@ -1456,7 +1465,13 @@
      } else if (IsNonTensor(*node_output)) {
        AllocPlan(current).alloc_kind = AllocKind::kAllocate;
      } else if (!context_->IsParallelExecutionEnabled() &&
+                OutputHasConsumerNode(*pnode, static_cast<int>(output_arg_def_index)) &&
                 FindReusableTensor(*node_output, &reused)) {
+       // The OutputHasConsumerNode check handles an edge case where a node produces a value that is not
+       // consumed by any other node. If we set it to kReuse, the buffer will be freed prematurely, as the
+       // logic in GenerateDeallocationPlan is based on processing consumer nodes. Changing the implementation
+       // of GenerateDeallocationPlan is an alternative, but that would be a much bigger change.
+
        // Reuse an available (dead) buffer for this output, this is only for sequential execution.
        Reuse(reused, current, AllocKind::kReuse);
      } else {

@@ -1906,8 +1921,8 @@
            node_to_wait[it->Index()].insert({node_index, wait_handle});
          }
        }
-      }  // output->Exists
-    }  // for each output
+      }
+    }
    if (output_consumed_in_subgraph) {
      const auto downstream = plan_.node_stream_map_[it->Index()];
      if (downstream != i) {
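The new helper's logic translates directly to a serialized ONNX graph. As a rough Python illustration only (the function name and recursion strategy are ours, not ONNX Runtime API), the sketch below answers the same question for a GraphProto, treating consumption inside an If/Loop/Scan body the way the C++ comment describes consumption via an implicit input:

import onnx


def output_has_consumer(graph: onnx.GraphProto, output_name: str) -> bool:
    """Return True if output_name is read by any node in graph,
    including nodes inside subgraph attributes (If/Loop/Scan bodies)."""
    for node in graph.node:
        if output_name in node.input:
            return True
        for attr in node.attribute:
            # A value may also be consumed inside a nested subgraph.
            subgraphs = []
            if attr.type == onnx.AttributeProto.GRAPH:
                subgraphs.append(attr.g)
            elif attr.type == onnx.AttributeProto.GRAPHS:
                subgraphs.extend(attr.graphs)
            if any(output_has_consumer(sg, output_name) for sg in subgraphs):
                return True
    return False

An output for which this returns False, and which is not a graph output, is exactly the case the new OutputHasConsumerNode guard excludes from buffer reuse: it is now assigned AllocKind::kAllocate instead of AllocKind::kReuse.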

onnxruntime/core/framework/session_state.cc

Lines changed: 1 addition & 1 deletion
@@ -1410,7 +1410,7 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string<PATH_CHAR_
   // Record the allocation plan
 
   // Uncomment the below to dump the allocation plan to std::cout
-  // LOGS(logger_, VERBOSE) << std::make_pair(p_seq_exec_plan_.get(), this);
+  // std::cout << std::make_pair(&*p_seq_exec_plan_, this);
 
 #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
   GetMemoryProfiler()->Init(GetExecutionPlan(), GetOrtValueNameIdxMap());

onnxruntime/test/framework/allocation_planner_test.cc

Lines changed: 37 additions & 0 deletions
@@ -2040,5 +2040,42 @@ TEST(AllocationPlannerTest, ReusedInputCrossDifferentStreams) {
   ASSERT_EQ(gather_count, 4) << "4 gather ops are all placed in CPU stream";
 }
 #endif
+
+#ifdef ENABLE_TRAINING_OPS
+// Use a carefully constructed model to reproduce a customer-reported issue where a model produced invalid output.
+// This issue required:
+// - buffer A that is re-used later in the model
+//   - output of the first Shape node
+//   - first usage completes after the following Cast node
+// - buffer B which has the same size requirement and is used after the first usage of A is complete
+//   - buffer B is used for the output from `squeeze2` and a number of other nodes in that part of the model
+// - re-use of buffer A for an output of a node that has no consumers whilst buffer B is still in use
+//   - this is the `per_input_length` output of the ConcatTraining node
+//
+// Because the logic to determine when a buffer can be freed is based on consumers, buffer A gets freed after the
+// Cast node. It is then re-used as buffer B because the memory pattern planner believes that block to be available.
+// When we re-use buffer A for the ConcatTraining output, we are using the same address for two different node
+// output buffers, leading to corruption of the output.
+// This tests that the allocation planner change to not re-use a buffer for outputs with no consumers prevents this.
+TEST(AllocationPlannerTest, AvoidReuseOfBufferForNodeOutputWithNoConsumers) {
+  SessionOptions sess_opt;
+  sess_opt.graph_optimization_level = TransformerLevel::Default;
+
+  InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/avoid_reuse_of_buffer_for_node_output_with_no_consumers.onnx"));
+  ASSERT_STATUS_OK(sess.Load());
+  auto status = sess.Initialize();
+  ASSERT_TRUE(status.IsOK());
+
+  const auto& session_state = sess.GetSessionState();
+  const auto& ort_value_index_map = session_state.GetOrtValueNameIdxMap();
+  const SequentialExecutionPlan* plan = session_state.GetExecutionPlan();
+
+  OrtValueIndex concat_training_unused_out_index;
+  // The per_input_length output of the ConcatTraining node has no consumers, so it should not reuse a buffer.
+  ASSERT_STATUS_OK(ort_value_index_map.GetIdx("per_input_length", concat_training_unused_out_index));
+  EXPECT_EQ(plan->allocation_plan[concat_training_unused_out_index].alloc_kind, AllocKind::kAllocate);
+}
+#endif
+
 }  // namespace test
 }  // namespace onnxruntime
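The committed test requires a training build (ENABLE_TRAINING_OPS) because ConcatTraining is what supplies the unconsumed per_input_length output. As a build-agnostic, hedged sketch of the same graph shape (our construction, not the committed test; it substitutes TopK's indices output for per_input_length, and only shows that such a model loads and runs, since the allocation plan is not exposed through the Python API):

# Minimal sketch: a model where one node output has no consumers.
# TopK produces `values` and `indices`; only `values` is used.
import numpy as np
import onnx
import onnxruntime as ort
from onnx import TensorProto, helper

graph = helper.make_graph(
    nodes=[
        helper.make_node("TopK", ["X", "K"], ["values", "indices"], name="topk"),
        # `indices` has no consumer and is not a graph output, mirroring
        # ConcatTraining's per_input_length in the committed test.
        helper.make_node("Identity", ["values"], ["Y"], name="out"),
    ],
    name="unconsumed_output",
    inputs=[helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 8])],
    outputs=[helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 4])],
    initializer=[helper.make_tensor("K", TensorProto.INT64, [1], [4])],
)
model = helper.make_model(graph, opset_imports=[helper.make_operatorsetid("", 13)])
onnx.checker.check_model(model)

sess = ort.InferenceSession(model.SerializeToString(), providers=["CPUExecutionProvider"])
(y,) = sess.run(None, {"X": np.arange(8, dtype=np.float32).reshape(1, 8)})
print(y)  # with the fix, `indices` gets its own allocation rather than reusing a live buffer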

onnxruntime/test/framework/execution_frame_test.cc

Lines changed: 25 additions & 10 deletions
@@ -454,27 +454,42 @@ TEST_F(ExecutionFrameTest, MemPatternWithExternalOutputsTest) {
 #endif
 
 TEST(ExecutionFrameTestWithoutSessionState, BadModelInvalidDimParamUsage) {
-  // load model with 2 Scan ops that both incorrectly use shapes of { 'None', 'None' } for their outputs.
-  // as 'None' is not a special value it's treated as a variable name, leading to a runtime error when we
-  // attempt to re-use the output from the first Scan node for the second. validate we detect this and error out.
+  // Model that has 2 inputs with shape {'Symbolic', 'Symbolic'} and is carefully constructed to re-use a
+  // buffer the size of one input for an output the size of the other input.
+  // The model is fine if all values of 'Symbolic' are the same, but invalid if they are not.
+  // As both inputs claim to have the same size, the allocation plan is based on that.
+  // Code in ExecutionFrame catches what would result in a buffer overflow if input 2 is actually larger
+  // than input 1 and we're attempting to re-use a buffer the size of input 1.
+  // The 'real' problem being tested is inconsistent values for a dim_param in a model, which could occur
+  // anywhere in the model.
   SessionOptions so;
   so.session_logid = "BadModelInvalidDimParamUsage";
 
   InferenceSession session_object{so, GetEnvironment()};
   ASSERT_STATUS_OK(session_object.Load("testdata/invalid_dim_param_value_repetition.onnx"));
   ASSERT_STATUS_OK(session_object.Initialize());
 
-  std::vector<int64_t> dims_X = {10, 6};
-  std::vector<float> values_X;
-  values_X.reserve(60);
+  std::vector<int64_t> dims_X1 = {10, 6};
+  std::vector<float> values_X1;
+  values_X1.reserve(60);
   for (int i = 0; i < 60; ++i) {
-    values_X.push_back(float(i));
+    values_X1.push_back(float(i));
   }
 
-  OrtValue ml_value;
-  CreateMLValue<float>(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dims_X, values_X, &ml_value);
+  std::vector<int64_t> dims_X2 = {10, 12};
+  std::vector<float> values_X2;
+  values_X2.reserve(120);
+  for (int i = 0; i < 120; ++i) {
+    values_X2.push_back(float(i));
+  }
+
+  OrtValue ml_value1;
+  CreateMLValue<float>(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dims_X1, values_X1, &ml_value1);
+  OrtValue ml_value2;
+  CreateMLValue<float>(TestCPUExecutionProvider()->CreatePreferredAllocators()[0], dims_X2, values_X2, &ml_value2);
   NameMLValMap feeds;
-  feeds.insert(std::make_pair("X", ml_value));
+  feeds.insert({"X1", ml_value1});
+  feeds.insert({"X2", ml_value2});
 
   // prepare outputs
   std::vector<std::string> output_names;
Binary file not shown.
invalid_dim_param_value_repetition.py

Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
+"""
+Run this script to recreate the original onnx model.
+Example usage:
+python invalid_dim_param_value_repetition.py
+"""
+
+import numpy as np
+import onnx
+
+
+def order_repeated_field(repeated_proto, key_name, order):
+    order = list(order)
+    repeated_proto.sort(key=lambda x: order.index(getattr(x, key_name)))
+
+
+def make_node(op_type, inputs, outputs, name=None, doc_string=None, domain=None, **kwargs):
+    node = onnx.helper.make_node(op_type, inputs, outputs, name, doc_string, domain, **kwargs)
+    if doc_string == "":
+        node.doc_string = ""
+    order_repeated_field(node.attribute, "name", kwargs.keys())
+    return node
+
+
+def make_graph(*args, doc_string=None, **kwargs):
+    graph = onnx.helper.make_graph(*args, doc_string=doc_string, **kwargs)
+    if doc_string == "":
+        graph.doc_string = ""
+    return graph
+
+
+model = onnx.helper.make_model(
+    opset_imports=[onnx.helper.make_operatorsetid("", 11)],
+    ir_version=5,
+    producer_name="skl2onnx",
+    producer_version="1.5.9999",
+    domain="ai.onnx",
+    model_version=0,
+    graph=make_graph(
+        name="OnnxIdentity",
+        inputs=[
+            onnx.helper.make_tensor_value_info("X1", onnx.TensorProto.FLOAT, shape=["Symbolic", "Symbolic"]),
+            onnx.helper.make_tensor_value_info("X2", onnx.TensorProto.FLOAT, shape=["Symbolic", "Symbolic"]),
+        ],
+        outputs=[
+            onnx.helper.make_tensor_value_info("Y", onnx.TensorProto.FLOAT, shape=[None, None]),
+        ],
+        initializer=[
+            onnx.numpy_helper.from_array(np.array([0.10000000149011612], dtype="float32"), name="Addcst"),
+        ],
+        nodes=[
+            # Take an input. Add to create a local output buffer for O01.
+            make_node("Add", inputs=["X1", "Addcst"], outputs=["O01"], name="Add1", domain=""),
+            # Use Shape -> ConstantOfShape to make O01 available for reuse.
+            make_node("Shape", inputs=["O01"], outputs=["O02"], name="Shape1", domain=""),
+            # ConstantOfShape to get back to the right rank, and ReduceSum so the value is broadcastable in
+            # the downstream Add.
+            make_node("ConstantOfShape", inputs=["O02"], outputs=["O03"], name="ConstantOfShape", domain=""),
+            make_node("ReduceSum", inputs=["O03"], outputs=["O04"], name="ReduceSum1", domain=""),
+            # Two Add nodes with the ReduceSum output. One could be in-place, but the other needs a buffer.
+            # This should trigger attempted re-use of O01, so if X2 is larger than X1 that should break.
+            make_node("Add", inputs=["O04", "X2"], outputs=["O05"], name="Add2", domain=""),
+            make_node("Add", inputs=["X2", "O04"], outputs=["O06"], name="Add3", domain=""),
+            # Concat to separate the Add outputs from the graph output (which is always allocated).
+            make_node("Concat", inputs=["O05", "O06"], outputs=["Y"], axis=-1, name="Concat", domain=""),
+        ],
+    ),
+)
+
+if __name__ == "__main__":
+    onnx.save(model, "invalid_dim_param_value_repetition.onnx")
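A usage sketch (our illustration, mirroring the {10, 6} and {10, 12} feeds in the BadModelInvalidDimParamUsage C++ test above): run the saved model with inputs whose actual 'Symbolic' dimensions disagree, and onnxruntime should fail cleanly instead of re-using an undersized buffer:

# Reproduce the inconsistent dim_param failure with mismatched feeds.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("invalid_dim_param_value_repetition.onnx",
                            providers=["CPUExecutionProvider"])
x1 = np.arange(60, dtype=np.float32).reshape(10, 6)
x2 = np.arange(120, dtype=np.float32).reshape(10, 12)  # 'Symbolic' disagrees with x1
try:
    sess.run(None, {"X1": x1, "X2": x2})
except Exception as exc:  # expect an ORT error rather than silent corruption
    print("expected failure:", exc)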
