slight code reorg and bug correction for cross_compile #3472

Status: Open · wants to merge 2 commits into main
2 changes: 1 addition & 1 deletion py/torch_tensorrt/dynamo/_compiler.py
@@ -1206,7 +1206,7 @@ def save_cross_compiled_exported_program(

from torch_tensorrt.dynamo._exporter import export

exp_program = export(gm, cross_compile_flag=True)
exp_program = export(gm, cross_compile_module=True)
torch.export.save(exp_program, file_path)
logger.debug(f"successfully saved the module for windows at {file_path}")

49 changes: 21 additions & 28 deletions py/torch_tensorrt/dynamo/_exporter.py
@@ -22,23 +22,23 @@

def export(
gm: torch.fx.GraphModule,
cross_compile_flag: Optional[bool] = False,
cross_compile_module: Optional[bool] = False,
) -> ExportedProgram:
"""Export the result of TensorRT compilation into the desired output format.
Arguments:
gm (torch.fx.GraphModule): Compiled Torch-TensorRT module, generated by ``torch_tensorrt.dynamo.compile``
inputs (torch.Tensor): Torch input tensors
cross_compile_flag (bool): Flag to indicated whether it is cross_compilation enabled or not
cross_compile_module (bool): Flag to indicate whether cross-compilation is enabled or not
"""
patched_module = transform(gm, cross_compile_flag)
patched_module = transform(gm, cross_compile_module)
exp_program = create_trt_exp_program(patched_module)
return exp_program


def transform(
gm: torch.fx.GraphModule,
cross_compile_flag: Optional[bool] = False,
cross_compile_module: Optional[bool] = False,
) -> torch.fx.GraphModule:
"""
Transforms the graphmodule by inlining Pytorch and TensorRT submodules.
@@ -48,7 +48,7 @@ def transform(
Arguments:
gm (torch.fx.GraphModule): Compiled Torch-TensorRT module, generated by ``torch_tensorrt.dynamo.compile``
inputs (torch.Tensor): Torch input tensors
cross_compile_flag (bool): Flag to indicated whether it is cross_compilation enabled or not
cross_compile_module (bool): Flag to indicate whether cross-compilation is enabled or not
Returns an inlined torch.fx.GraphModule
"""
@@ -57,7 +57,7 @@ def transform(
gm = copy.deepcopy(gm)

# Inline TensorRT submodules
inline_trt_modules(gm, cross_compile_flag)
inline_trt_modules(gm, cross_compile_module)

# Inline pytorch submodules
inline_torch_modules(gm)
@@ -356,7 +356,7 @@ def create_trt_exp_program(


def inline_trt_modules(
gm: torch.fx.GraphModule, cross_compile_flag: Optional[bool] = False
gm: torch.fx.GraphModule, cross_compile_module: Optional[bool] = False
) -> torch.fx.GraphModule:
"""
Replace TRT submodules with trt engine nodes.
@@ -380,7 +380,16 @@ def inline_trt_modules(
num_outputs = len(trt_module_node.meta["val"])
# Insert a call_function node to perform inference on TRT engine
with gm.graph.inserting_before(trt_module_node):
if not cross_compile_flag:
if cross_compile_module:
engine_info = trt_module._pack_engine_info()
engine_bytes = engine_info[ENGINE_IDX]
engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes).decode("utf-8")
# insert the no_op_placeholder node in the graph; it should be replaced with the actual execute_engine node when loaded on Windows
trt_node = gm.graph.call_function(
torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default,
(trt_module_node.args, *engine_info),
)

Collaborator:

Do we still need to unpack this list?

Collaborator (Author):

We would still need to unpack the list; otherwise, loading on Windows fails with:

File "C:\Users\abose\Documents\work\TensorRT\torchTRT\Lib\site-packages\torch\_export\serde\serialize.py", line 2258, in deserialize_inputs
  args.append(actual_args[schema_arg.name])
        ~~~~~~~~~~~^^^^^^^^^^^^^^^^^
KeyError: 'name'
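A minimal sketch of the schema binding behind this thread, in plain Python with hypothetical names and values (independent of FX, TensorRT, and the real op registration): the custom op declares each serialized field as its own positional parameter, so the packed engine_info list has to be splatted for every argument to bind to a schema name.

# Hypothetical stand-in for the op schema: each serialized field is a
# separate named positional parameter, loosely mirroring
# tensorrt::no_op_placeholder_for_execute_engine.
def placeholder_schema(inputs, abi_version, name, serialized_engine):
    return inputs

engine_info = ["abi_v1", "module_0", "b64-engine-bytes"]  # hypothetical values

placeholder_schema([1, 2], *engine_info)  # OK: each field binds to a parameter

try:
    placeholder_schema([1, 2], engine_info)  # whole list binds to abi_version
except TypeError as err:
    # 'name' and 'serialized_engine' are left unbound, analogous to the
    # KeyError: 'name' raised during deserialization on Windows.
    print(err)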
else:
# for the normal workflow: use the execute_engine node
engine_name = f"{name}_engine"
setattr(gm, engine_name, trt_module.engine)
@@ -396,16 +405,6 @@ def inline_trt_modules(
engine_node.meta["val"] = CustomObjArgument(
name=engine_node.name, class_fqn=""
)
else:
# for the cross compile for windows workflow: use the no_op_placeholder node
engine_info = trt_module._pack_engine_info()
engine_bytes = engine_info[ENGINE_IDX]
engine_info[ENGINE_IDX] = base64.b64encode(engine_bytes).decode("utf-8")
# insert the no_placeholder node in the graph which should be replaced to the actual execute_engine node while load in the windows
trt_node = gm.graph.call_function(
torch.ops.tensorrt.no_op_placeholder_for_execute_engine.default,
(trt_module_node.args, *engine_info),
)
# set trt_node.meta with trt_module_node.meta
assert num_outputs > 0
trt_node.meta["val"] = trt_module_node.meta["val"]
@@ -464,16 +463,10 @@ def replace_execute_engine_no_op_node(
name=engine_node.name, class_fqn=""
)

if len(no_op_placeholder_node.meta["val"]) == 1:
with gm.graph.inserting_after(trt_node):
getitem_output = gm.graph.call_function(operator.getitem, (trt_node, 0))
getitem_output.meta["val"] = trt_node.meta["val"]
no_op_placeholder_node.replace_all_uses_with(getitem_output)
else:
no_op_placeholder_node.replace_all_uses_with(trt_node)
getitem_nodes = trt_node.users
for idx, getitem_node in enumerate(getitem_nodes):
getitem_node.meta["val"] = trt_node.meta["val"][idx]
no_op_placeholder_node.replace_all_uses_with(trt_node)
Collaborator:

Can you add a multi-output test case to the cross-compile tests?

getitem_nodes = trt_node.users
for idx, getitem_node in enumerate(getitem_nodes):
getitem_node.meta["val"] = trt_node.meta["val"][idx]

Collaborator (Author):

@narendasan this is the part that should address the bug.

gm.graph.erase_node(no_op_placeholder_node)

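A standalone torch.fx sketch of the uniform rewrite discussed in the thread above, using a stand-in multi-output graph rather than the PR's TRT node: after replace_all_uses_with, the existing getitem users already index the replacement node, so only their meta["val"] entries need refreshing per output slot, and the old single-output special case becomes unnecessary.

import operator

import torch
import torch.fx as fx

# Stand-in module whose traced graph has a multi-output node consumed
# through operator.getitem, loosely mirroring an inlined TRT engine node.
class MultiOut(torch.nn.Module):
    def forward(self, a, b):
        out = torch.stack((a + b, a - b))
        return out[0], out[1]

gm = fx.symbolic_trace(MultiOut())
multi_node = next(n for n in gm.graph.nodes if n.target is torch.stack)

# Pretend the node was swapped for a replacement carrying per-output
# metadata, then refresh each getitem user from its output slot; this is
# the same pattern the fixed replace_execute_engine_no_op_node relies on.
multi_node.meta["val"] = [torch.empty(2, 3), torch.empty(2, 3)]
for idx, user in enumerate(
    u for u in multi_node.users if u.target is operator.getitem
):
    user.meta["val"] = multi_node.meta["val"][idx]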
21 changes: 21 additions & 0 deletions py/torch_tensorrt/dynamo/runtime/meta_ops/register_meta_ops.py
@@ -150,3 +150,24 @@ def __setstate__(self, serialized_state: List[str]) -> Any:

def __getstate__(self) -> Any:
pass


@torch.library.custom_op(
"tensorrt::no_op_placeholder_for_execute_engine", mutates_args=()
)
def no_op_placeholder_for_execute_engine(
inputs: List[torch.Tensor],
abi_version: str,
name: str,
serialized_device_info: str,
serialized_engine: str,
serialized_in_binding_names: str,
serialized_out_binding_names: str,
serialized_hardware_compatible: str,
serialized_metadata: str,
serialized_target_platform: str,
serialized_require_output_allocator: str,
) -> List[torch.Tensor]:
raise RuntimeError(
"The saved model is cross compiled for windows in Linux, should only be loadded in Windows via torch_tensorrt.load_cross_compiled_exported_program() api."
)
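For context, a sketch of the end-to-end flow this placeholder guards (API names taken from this PR's tests and error message; the toy module, shapes, and file path are hypothetical): the program is cross-compiled and saved on Linux, and only the dedicated loader on Windows swaps the no-op placeholder back to a real execute_engine node.

import torch
import torch_tensorrt

class Add(torch.nn.Module):  # toy module, as in the tests below
    def forward(self, a, b):
        return torch.add(a, b)

inputs = (torch.randn(2, 3).cuda(), torch.randn(2, 3).cuda())
exp_program = torch.export.export(Add().eval().cuda(), inputs)

# On Linux: cross-compile for Windows and serialize the exported program.
trt_gm = torch_tensorrt.dynamo.cross_compile_for_windows(
    exp_program, inputs=inputs, min_block_size=1
)
torch_tensorrt.dynamo.save_cross_compiled_exported_program(
    trt_gm, file_path="trt.ep"
)

# On Windows: this loader replaces the no_op placeholder with execute_engine;
# a plain torch.export.load would hit the RuntimeError above.
loaded = torch_tensorrt.load_cross_compiled_exported_program("trt.ep")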
20 changes: 0 additions & 20 deletions py/torch_tensorrt/runtime/_utils.py
@@ -128,23 +128,3 @@ def _get_most_compatible_device(
best_match = candidate

return best_match


@torch.library.custom_op(
"tensorrt::no_op_placeholder_for_execute_engine", mutates_args=()
)
def no_op_placeholder_for_execute_engine(
inputs: List[torch.Tensor],
abi_version: str,
name: str,
serialized_device_info: str,
serialized_engine: str,
serialized_in_binding_names: str,
serialized_out_binding_names: str,
serialized_hardware_compatible: str,
serialized_metadata: str,
serialized_target_platform: str,
) -> List[torch.Tensor]:
raise RuntimeError(
"The saved model is cross compiled for windows in Linux, should only be loadded in Windows via torch_tensorrt.load_cross_compiled_exported_program() api."
)
28 changes: 28 additions & 0 deletions tests/py/dynamo/runtime/test_003_cross_compile_for_windows.py
@@ -63,3 +63,31 @@ def forward(self, a, b):
)
except Exception as e:
pytest.fail(f"unexpected exception raised: {e}")

@unittest.skipIf(
platform.system() != "Linux" or platform.architecture()[0] != "64bit",
"Cross compile for windows can only be enabled on linux x86-64 platform",
)
@pytest.mark.unit
def test_dynamo_cross_compile_for_windows_multiple_output(self):
class Add(torch.nn.Module):
def forward(self, a, b):
return torch.add(a, b), torch.add(a, b)

model = Add().eval().cuda()
inputs = (torch.randn(2, 3).cuda(), torch.randn(2, 3).cuda())
trt_ep_path = os.path.join(tempfile.gettempdir(), "trt.ep")
exp_program = torch.export.export(model, inputs)
compile_spec = {
"inputs": inputs,
"min_block_size": 1,
}
try:
trt_gm = torch_tensorrt.dynamo.cross_compile_for_windows(
exp_program, **compile_spec
)
torch_tensorrt.dynamo.save_cross_compiled_exported_program(
trt_gm, file_path=trt_ep_path
)
except Exception as e:
pytest.fail(f"unexpected exception raised: {e}")