@@ -361,7 +361,7 @@ void BasicBackend::SetNumThreads(ov::AnyMap& device_config) {
 
 #ifdef IO_BUFFER_ENABLED
 // Wait for Remote Asynchronous inference completion
-void BasicBackend::RemoteInfer(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
+void BasicBackend::RemoteInfer(Ort::KernelContext& context, OVInferRequestPtr infer_request) const {
   try {
     auto graph_input_info = exe_network_.Get().inputs();
     int input_idx = 0;
@@ -467,7 +467,7 @@ void BasicBackend::RemoteInfer(Ort::KernelContext& context, OVInferRequestPtr in
 }
 #endif
 
-void BasicBackend::Infer(OrtKernelContext* ctx) {
+void BasicBackend::Infer(OrtKernelContext* ctx) const {
   Ort::KernelContext context(ctx);
 
   LOGS_DEFAULT(INFO) << log_tag << "Running graph " << subgraph_context_.subgraph_name;
@@ -492,43 +492,64 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
     return;
   }
 
-  bool gpu = session_context_.device_type.find("GPU") != std::string::npos;
-  bool cpu_or_gpu = gpu || (session_context_.device_type.find("CPU") != std::string::npos);
-
   // guarded_request will be released back to the pool when it goes out of scope
   auto guarded_request = infer_req_pool_->getRequest();
   auto& infer_request = guarded_request.infer_request_;
 #ifdef IO_BUFFER_ENABLED
-  if (gpu &&
+  if (session_context_.device_type.find("GPU") != std::string::npos &&
       (session_context_.context != nullptr) && session_context_.is_wholly_supported_graph) {
     RemoteInfer(context, infer_request);
   } else
 #else
   {  // scope for else if IO_BUFFER_ENABLED
 
-    // Bind inputs
-    for (const auto& input_info : bindings_->network_inputs_) {
-      if (subgraph_context_.has_dynamic_input_shape &&
-          !session_context_.disable_dynamic_shapes &&
-          cpu_or_gpu) {
-        // copy the input to set current shape.
-        auto input_info_copy = input_info;
+    if (bindings_->has_dynamic_io_ ||
+        (subgraph_context_.has_dynamic_input_shape &&
+         !session_context_.disable_dynamic_shapes)) {
+      // Dynamic shape inference
+
+      // We don't know the output shapes so we need to get the outputs from the infer request and copy them into the ort
+      // tensors instead of binding them to the infer request directly.
+
+      // Bind inputs
+      for (const auto& input_info : bindings_->network_inputs_) {
+        // Set the input shape based on the input tensor from ort
         auto tensor = context.GetInput(input_info.onnx_index);
-        input_info_copy.shape = ParameterShape(tensor.GetTensorTypeAndShapeInfo().GetShape());
+        auto input_shape = ParameterShape(tensor.GetTensorTypeAndShapeInfo().GetShape());
 
-        infer_request->SetTensor(input_info_copy, const_cast<void*>(tensor.GetTensorRawData()));
-      } else {
+        infer_request->SetTensorShapeOverride(input_info, input_shape, const_cast<void*>(tensor.GetTensorRawData()));
+      }
+
+      // Run Inference
+      infer_request->Infer();
+
+      // Copy outputs
+      for (const auto& output_info : bindings_->network_outputs_) {
+        auto ov_tensor = infer_request->GetTensor(output_info.name);
+        auto output_shape = ParameterShape::ToOnnxShape(ov_tensor->get_shape());
+        auto ort_tensor = context.GetOutput(output_info.onnx_index, output_shape);
+
+        memcpy_s(ort_tensor.GetTensorMutableRawData(),
+                 ort_tensor.GetTensorSizeInBytes(),
+                 ov_tensor->data(),
+                 ov_tensor->get_byte_size());
+      }
+    } else {
+      // Static shape inference
+
+      // Bind inputs
+      for (const auto& input_info : bindings_->network_inputs_) {
         infer_request->SetTensor(input_info, const_cast<void*>(context.GetInput(input_info.onnx_index).GetTensorRawData()));
       }
-    }
 
-    // Bind outputs
-    for (const auto& output_info : bindings_->network_outputs_) {
-      infer_request->SetTensor(output_info, context.GetOutput(output_info.onnx_index, output_info.shape.onnx()).GetTensorMutableRawData());
-    }
+      // Bind outputs
+      for (const auto& output_info : bindings_->network_outputs_) {
+        infer_request->SetTensor(output_info, context.GetOutput(output_info.onnx_index, output_info.shape.onnx()).GetTensorMutableRawData());
+      }
 
-    // Run Inference
-    infer_request->Infer();
+      // Run Inference
+      infer_request->Infer();
+    }
 
     // Fill constant outputs if needed
     for (const auto& [name, node] : const_outputs_map_) {
@@ -552,7 +573,7 @@ void BasicBackend::Infer(OrtKernelContext* ctx) {
 
 #ifndef NDEBUG
 #ifndef IO_BUFFER_ENABLED
-  // Print performance counts before releasing the infer_request for potential thread safety
+  // Print performance counts before releasing the infer_request for thread safety
   if (openvino_ep::backend_utils::IsDebugEnabled()) {
     std::string& hw_target = session_context_.device_type;
     printPerformanceCounts(infer_request, std::cout, hw_target);
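
The core behavioral change in the hunk at old line 492 is that dynamic-shape subgraphs now take a separate path: each input is bound together with a per-request shape override, inference runs, and only afterwards are the outputs copied into the ORT output tensors, because their shapes are only known once Infer() has produced them. The standalone C++ sketch below illustrates that ordering; FakeRequest, FakeTensor, and their members are hypothetical stand-ins, not the backend's or OpenVINO's API.

// Minimal sketch of the dynamic-shape flow in this diff (not the real backend):
// the output shape is only known after Infer(), so the caller queries the
// backend tensor afterwards, allocates the output of that shape, and copies
// the bytes out. All names here are illustrative only.
#include <cstring>
#include <iostream>
#include <vector>

using Shape = std::vector<size_t>;

struct FakeTensor {
  Shape shape;
  std::vector<float> data;
};

struct FakeRequest {
  FakeTensor input, output;

  // Dynamic path: the input shape is taken from the caller's tensor at request time.
  void SetInput(const Shape& shape, const float* data, size_t count) {
    input.shape = shape;
    input.data.assign(data, data + count);
  }

  // Pretend inference: the output shape depends on the input, so it is only
  // known after this call (here it simply mirrors the input shape).
  void Infer() {
    output.shape = input.shape;
    output.data = input.data;
    for (auto& v : output.data) v *= 2.0f;
  }

  const FakeTensor& GetOutput() const { return output; }
};

int main() {
  std::vector<float> ort_input = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  Shape ort_shape = {2, 3};  // the shape arrives with the request, not at compile time

  FakeRequest request;
  request.SetInput(ort_shape, ort_input.data(), ort_input.size());
  request.Infer();

  // Only now is the output shape known; allocate the destination buffer and copy.
  const FakeTensor& out = request.GetOutput();
  std::vector<float> ort_output(out.data.size());
  std::memcpy(ort_output.data(), out.data.data(), out.data.size() * sizeof(float));

  std::cout << "output[0] = " << ort_output[0] << ", rank = " << out.shape.size() << "\n";
}

The static path in the diff can keep binding output buffers up front precisely because their shapes are known before Infer() is called, which avoids the extra copy.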
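
The same hunk also leans on infer_req_pool_->getRequest() returning a guard whose destruction hands the request back ("guarded_request will be released back to the pool when it goes out of scope"). A minimal sketch of that RAII pattern follows; SimpleRequestPool, GuardedRequest, and FakeInferRequest are illustrative names only, and the sketch assumes the pool is never exhausted, which a real pool would have to handle.

// Minimal sketch of an RAII-guarded request pool (not the ONNX Runtime implementation).
#include <iostream>
#include <memory>
#include <mutex>
#include <vector>

struct FakeInferRequest {
  int id;
  void Infer() { std::cout << "running inference on request " << id << "\n"; }
};

class SimpleRequestPool {
 public:
  explicit SimpleRequestPool(size_t n) {
    for (size_t i = 0; i < n; ++i) {
      free_.push_back(std::make_shared<FakeInferRequest>(FakeInferRequest{static_cast<int>(i)}));
    }
  }

  // RAII guard: hands the request back to the pool when it goes out of scope.
  struct GuardedRequest {
    std::shared_ptr<FakeInferRequest> infer_request_;
    SimpleRequestPool* pool_ = nullptr;
    ~GuardedRequest() {
      if (pool_ && infer_request_) pool_->putBack(infer_request_);
    }
  };

  GuardedRequest getRequest() {
    std::lock_guard<std::mutex> lock(mutex_);
    auto req = free_.back();  // sketch assumption: the pool is never empty
    free_.pop_back();
    return GuardedRequest{req, this};
  }

 private:
  void putBack(std::shared_ptr<FakeInferRequest> req) {
    std::lock_guard<std::mutex> lock(mutex_);
    free_.push_back(std::move(req));
  }

  std::mutex mutex_;
  std::vector<std::shared_ptr<FakeInferRequest>> free_;
};

int main() {
  SimpleRequestPool pool(2);
  {
    auto guarded_request = pool.getRequest();  // borrow a request from the pool
    auto& infer_request = guarded_request.infer_request_;
    infer_request->Infer();
  }  // guard destroyed here: the request is returned to the pool
}

This is also why the performance counters in the last hunk are printed before the guard goes out of scope: once the guard is destroyed, another thread may pick up the same request from the pool.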