Commit a225769

feat: Add support for TorchTensorRTModule in Dynamo
- Rename `TRTModuleNext` to `TorchTensorRTModule` across the repository, and move the source directory to `dynamo`
- Update imports across the repository
- Refactor `convert_module` code to support conversion to a `TorchTensorRTModule`
- Add logging information about which runtime is being used in Dynamo compile
- Add tests for `TorchTensorRTModule` functionality in Dynamo
1 parent bd9c29a commit a225769

17 files changed: +247 −42 lines
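For orientation, a minimal sketch of the user-facing change (assuming a CUDA device and a TensorRT-enabled build; the model and shapes are illustrative, not taken from this commit):

import torch
from torch_tensorrt.dynamo import compile

model = torch.nn.Linear(8, 4).eval().cuda()  # illustrative model
inputs = [torch.randn(2, 8).cuda()]

# use_experimental_rt=True selects the C++ TorchTensorRTModule runtime added
# by this commit; the default (False) keeps the Python TRTModule runtime.
trt_model = compile(
    model,
    inputs,
    min_block_size=1,
    use_experimental_rt=True,
)
print(trt_model(*inputs).shape)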

examples/fx/fx2trt_example_next.py

+4 −1

@@ -8,7 +8,10 @@
 import torch_tensorrt.fx.tracer.acc_tracer.acc_tracer as acc_tracer
 from torch_tensorrt.fx import InputTensorSpec, TRTInterpreter
 from torch_tensorrt.fx.tools.trt_splitter import TRTSplitter, TRTSplitterSetting
-from torch_tensorrt import TRTModuleNext as TRTModule, Device
+from torch_tensorrt.dynamo._TorchTensorRTModule import (
+    TorchTensorRTModule as TRTModule,
+    Device,
+)

 # The purpose of this example is to demonstrate the overall flow of lowering a PyTorch
 # model to TensorRT via FX with existing FX based tooling. The general lowering flow

py/torch_tensorrt/__init__.py

−1

@@ -91,7 +91,6 @@ def _find_lib(name, paths):
 from torch_tensorrt import logging
 from torch_tensorrt._Input import Input
 from torch_tensorrt._Device import Device
-from torch_tensorrt._TRTModuleNext import TRTModuleNext

 from torch_tensorrt import fx
py/torch_tensorrt/_TRTModuleNext.py renamed to py/torch_tensorrt/dynamo/_TorchTensorRTModule.py

+10 −11

@@ -1,6 +1,5 @@
 import logging
-from operator import truediv
-from typing import Any, List, Sequence, Tuple
+from typing import Any, List, Tuple

 import torch
 from torch_tensorrt import _C
@@ -9,8 +8,8 @@
 logger = logging.getLogger(__name__)


-class TRTModuleNext(torch.nn.Module):
-    """TRTModuleNext is a PyTorch module which encompasses an arbitrary TensorRT Engine.
+class TorchTensorRTModule(torch.nn.Module):
+    """TorchTensorRTModule is a PyTorch module which encompasses an arbitrary TensorRT Engine.

     This module is backed by the Torch-TensorRT runtime and is fully compatibile with both
     FX / Python deployments (just ``import torch_tensorrt`` as part of the application) as
@@ -20,7 +19,7 @@ class TRTModuleNext(torch.nn.Module):
     The forward function is simpily forward(*args: torch.Tensor) -> Tuple[torch.Tensor] where
     the internal implementation is ``return Tuple(torch.ops.tensorrt.execute_engine(list(inputs), self.engine))``

-    > Note: TRTModuleNext only supports engines built with explict batch
+    > Note: TorchTensorRTModule only supports engines built with explict batch

     Attributes:
         name (str): Name of module (for easier debugging)
@@ -37,7 +36,7 @@ def __init__(
         output_binding_names: List[str] = [],
         target_device: Device = Device._current_device(),
     ):
-        """__init__ method for torch_tensorrt.TRTModuleNext
+        """__init__ method for torch_tensorrt.TorchTensorRTModule

         Takes a name, target device, serialized TensorRT engine, and binding names / order and constructs
         a PyTorch ``torch.nn.Module`` around it.
@@ -71,9 +70,9 @@ def __init__(

         """
         logger.warning(
-            "TRTModuleNext should be considered experimental stability, APIs are subject to change. Note: TRTModuleNext only supports engines built with explict batch"
+            "TorchTensorRTModule should be considered experimental stability, APIs are subject to change. Note: TorchTensorRTModule only supports engines built with explict batch"
         )
-        super(TRTModuleNext, self).__init__()
+        super(TorchTensorRTModule, self).__init__()

         if not isinstance(serialized_engine, bytearray):
             ValueError("Expected serialized engine as bytearray")
@@ -89,8 +88,8 @@ def __init__(
                    self.name + "_engine" if self.name != "" else "tensorrt_engine",
                    target_device._to_serialized_rt_device(),
                    serialized_engine,
-                    TRTModuleNext._pack_binding_names(self.input_binding_names),
-                    TRTModuleNext._pack_binding_names(self.output_binding_names),
+                    TorchTensorRTModule._pack_binding_names(self.input_binding_names),
+                    TorchTensorRTModule._pack_binding_names(self.output_binding_names),
                ]
            )
        else:
@@ -154,7 +153,7 @@ def is_non_tensor(i: Tuple[Any, bool]) -> bool:

            non_tensors = [i[0] for i in filter(zip(inputs, types), is_non_tensor)]
            raise RuntimeError(
-                f"TRTModuleNext expects a flattened list of tensors as input, found non tensors: {non_tensors}"
+                f"TorchTensorRTModule expects a flattened list of tensors as input, found non tensors: {non_tensors}"
            )

        outputs = torch.ops.tensorrt.execute_engine(list(inputs), self.engine)
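As a usage note, a hedged sketch of constructing the renamed module directly; the engine bytes and binding names below are placeholders standing in for the output of a real TensorRT build:

import torch
from torch_tensorrt.dynamo._TorchTensorRTModule import TorchTensorRTModule

# serialized_engine is assumed to come from a TensorRT build elsewhere,
# e.g. bytearray(engine.serialize()); it is a placeholder here.
trt_mod = TorchTensorRTModule(
    serialized_engine=serialized_engine,
    name="example_module",                 # hypothetical engine name
    input_binding_names=["input_0"],       # placeholder binding names
    output_binding_names=["output_0"],
)
outputs = trt_mod(torch.randn(1, 8).cuda())  # input shapes must match the engine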

py/torch_tensorrt/dynamo/backend/__init__.py

+11

@@ -16,6 +16,7 @@
     WORKSPACE_SIZE,
     MIN_BLOCK_SIZE,
     PASS_THROUGH_BUILD_FAILURES,
+    USE_EXPERIMENTAL_RT,
 )


@@ -45,6 +46,7 @@ def compile(
     torch_executed_ops=[],
     torch_executed_modules=[],
     pass_through_build_failures=PASS_THROUGH_BUILD_FAILURES,
+    use_experimental_rt=USE_EXPERIMENTAL_RT,
     **kwargs,
 ):
     if debug:
@@ -57,6 +59,11 @@ def compile(
             + "torch_executed_ops, pass_through_build_failures}"
         )

+    if "use_experimental_fx_rt" in kwargs:
+        use_experimental_rt = kwargs["use_experimental_fx_rt"]
+
+    logger.info(f"Using {'C++' if use_experimental_rt else 'Python'} TRT Runtime")
+
     if not isinstance(inputs, collections.abc.Sequence):
         inputs = [inputs]

@@ -91,6 +98,7 @@ def compile(
         min_block_size=min_block_size,
         torch_executed_ops=torch_executed_ops,
         pass_through_build_failures=pass_through_build_failures,
+        use_experimental_rt=use_experimental_rt,
         **kwargs,
     )

@@ -114,6 +122,7 @@ def create_backend(
     min_block_size: int = MIN_BLOCK_SIZE,
     torch_executed_ops: Sequence[str] = set(),
     pass_through_build_failures: bool = PASS_THROUGH_BUILD_FAILURES,
+    use_experimental_rt: bool = USE_EXPERIMENTAL_RT,
     **kwargs,
 ):
     """Create torch.compile backend given specified arguments
@@ -125,6 +134,7 @@ def create_backend(
         min_block_size: Minimum number of operators per TRT-Engine Block
         torch_executed_ops: Sequence of operations to run in Torch, regardless of converter coverage
         pass_through_build_failures: Whether to fail on TRT engine build errors (True) or not (False)
+        use_experimental_rt: Whether to use the new experimental TRTModuleNext for TRT engines
     Returns:
         Backend for torch.compile
     """
@@ -136,4 +146,5 @@ def create_backend(
         min_block_size=min_block_size,
         torch_executed_ops=torch_executed_ops,
         pass_through_build_failures=pass_through_build_failures,
+        use_experimental_rt=use_experimental_rt,
     )
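A short sketch of the backend path under the same assumptions as the earlier example; note that compile() also honors the legacy use_experimental_fx_rt kwarg by aliasing it onto use_experimental_rt, as the hunk above shows:

import torch
from torch_tensorrt.dynamo.backend import create_backend

# model and inputs as in the earlier sketch
backend = create_backend(min_block_size=1, use_experimental_rt=True)
compiled = torch.compile(model, backend=backend)
compiled(*inputs)  # first call triggers compilation with the C++ runtime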

py/torch_tensorrt/dynamo/backend/_defaults.py

+1

@@ -6,3 +6,4 @@
 WORKSPACE_SIZE = 0
 MIN_BLOCK_SIZE = 5
 PASS_THROUGH_BUILD_FAILURES = False
+USE_EXPERIMENTAL_RT = False

py/torch_tensorrt/dynamo/backend/_settings.py

+2

@@ -8,6 +8,7 @@
     WORKSPACE_SIZE,
     MIN_BLOCK_SIZE,
     PASS_THROUGH_BUILD_FAILURES,
+    USE_EXPERIMENTAL_RT,
 )


@@ -19,3 +20,4 @@ class CompilationSettings:
     min_block_size: int = MIN_BLOCK_SIZE
     torch_executed_ops: Sequence[str] = field(default_factory=set)
     pass_through_build_failures: bool = PASS_THROUGH_BUILD_FAILURES
+    use_experimental_rt: bool = USE_EXPERIMENTAL_RT
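For illustration, the flag is simply carried on the CompilationSettings dataclass and consumed later in conversion.py (a sketch using only fields shown in this diff):

from torch_tensorrt.dynamo.backend._settings import CompilationSettings

settings = CompilationSettings(min_block_size=1, use_experimental_rt=True)
assert settings.use_experimental_rt  # conversion.py branches on this flag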

py/torch_tensorrt/dynamo/backend/backends.py

+1

@@ -139,6 +139,7 @@ def _compile_module(
             submodule,
             submodule_inputs,
             settings=settings,
+            name=name,
         )

         trt_modules[name] = trt_mod

py/torch_tensorrt/dynamo/backend/conversion.py

+22 −7

@@ -1,7 +1,7 @@
 from typing import Sequence, Union
 import torch
+import io
 from torch_tensorrt.fx.trt_module import TRTModule
-from torch_tensorrt import TRTModuleNext
 from torch_tensorrt.dynamo.backend._settings import CompilationSettings
 from torch_tensorrt.dynamo.fx_ts_compat.fx2trt import (
     InputTensorSpec,
@@ -15,12 +15,14 @@ def convert_module(
     module: torch.fx.GraphModule,
     inputs: Sequence[torch.Tensor],
     settings: CompilationSettings = CompilationSettings(),
-) -> Union[TRTModuleNext, TRTModule]:
+    name: str = "",
+):
     """Convert an FX module to a TRT module
     Args:
         module: FX GraphModule to convert
         inputs: Sequence of Tensors representing inputs to the module
         settings: Compilation settings
+        name: TRT engine name
     Returns:
         TRTModule or TRTModuleNext
     """
@@ -50,8 +52,21 @@ def convert_module(
        ),
    )

-    return TRTModule(
-        engine=interpreter_result.engine,
-        input_names=interpreter_result.input_names,
-        output_names=interpreter_result.output_names,
-    )
+    if settings.use_experimental_rt:
+        from torch_tensorrt.dynamo._TorchTensorRTModule import TorchTensorRTModule
+
+        with io.BytesIO() as engine_bytes:
+            engine_bytes.write(interpreter_result.engine.serialize())
+            engine_str = engine_bytes.getvalue()
+        return TorchTensorRTModule(
+            serialized_engine=engine_str,
+            name=name,
+            input_binding_names=interpreter_result.input_names,
+            output_binding_names=interpreter_result.output_names,
+        )
+    else:
+        return TRTModule(
+            engine=interpreter_result.engine,
+            input_names=interpreter_result.input_names,
+            output_names=interpreter_result.output_names,
+        )
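Both wrappers subclass torch.nn.Module, so callers such as _compile_module in backends.py need no branching on the runtime choice. A hedged sketch of checking which runtime convert_module returned (module, inputs, and settings are assumed to be in scope, as at the call site):

from torch_tensorrt.fx.trt_module import TRTModule
from torch_tensorrt.dynamo._TorchTensorRTModule import TorchTensorRTModule
from torch_tensorrt.dynamo.backend.conversion import convert_module

trt_mod = convert_module(module, inputs, settings=settings, name="engine_0")
expected = TorchTensorRTModule if settings.use_experimental_rt else TRTModule
assert isinstance(trt_mod, expected)  # either way, trt_mod is an nn.Module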
@@ -0,0 +1,173 @@
+from torch_tensorrt.dynamo.backend.lowering import partition
+from torch.testing._internal.common_utils import run_tests, TestCase
+import torch
+from copy import deepcopy
+from torch_tensorrt.dynamo import compile
+from utils import lower_graph_testing
+from torch_tensorrt.dynamo.common_utils.test_utils import DECIMALS_OF_AGREEMENT
+
+
+class TestTRTModuleNextCompilation(TestCase):
+    def test_trt_module_next_full_support(self):
+        class FullySupportedMultiOp(torch.nn.Module):
+            def forward(self, x, y):
+                out = x - y
+                out = out + x
+                out = 2 * out
+                out = out + y
+                return torch.mean(out, dim=1)
+
+        fx_graph = torch.fx.symbolic_trace(FullySupportedMultiOp())
+        partitioned_graph = partition(deepcopy(fx_graph), min_block_size=3)
+
+        self.assertEquals(
+            len(list(partitioned_graph.named_children())),
+            1,
+            "All operators are supported, there should be one segment",
+        )
+
+        inputs = [
+            torch.randint(-5, 5, (16, 7), dtype=torch.float).cuda(),
+            torch.randint(-5, 5, (16, 7), dtype=torch.float).cuda(),
+        ]
+
+        torch._dynamo.reset()
+
+        # Validate that the results between Torch and Torch-TRT are similar
+        optimized_model = compile(
+            fx_graph,
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+            torch_executed_ops={"torch.ops.aten.add.Tensor"},
+            use_experimental_rt=True,
+            debug=True,
+        )
+        optimized_model_results = optimized_model(*inputs).detach().cpu()
+        torch_model_results = fx_graph(*inputs).detach().cpu()
+
+        max_diff = float(
+            torch.max(torch.abs(optimized_model_results - torch_model_results))
+        )
+        self.assertAlmostEqual(
+            max_diff,
+            0,
+            DECIMALS_OF_AGREEMENT,
+            f"TRT outputs don't match with the original model.",
+        )
+
+    def test_trt_module_next_partial_support(self):
+        class PartiallySupportedMultiOp(torch.nn.Module):
+            def forward(self, x, y):
+                out = x - y
+                out = out - 3 * x
+                out = out + y
+                out = out.to(torch.float)
+                out = 2 * out
+                return torch.mean(out, dim=-1)
+
+        fx_graph = torch.fx.symbolic_trace(PartiallySupportedMultiOp())
+        unexpected_ops = {torch.ops.aten.add.Tensor}
+
+        inputs = [
+            torch.randint(-40, 40, (16, 7, 5), dtype=torch.int).cuda(),
+            torch.randint(1, 40, (16, 7, 5), dtype=torch.int).cuda(),
+        ]
+
+        (unexpected_ops_seen, _, partitioned_graphs,) = lower_graph_testing(
+            fx_graph,
+            inputs,
+            unexpected_ops=unexpected_ops,
+            min_block_size=1,
+            torch_executed_ops={"torch.ops.aten.add.Tensor"},
+            testing_partitioning=True,
+        )
+
+        self.assertEquals(
+            len(unexpected_ops_seen),
+            0,
+            f"The following unexpected ops were encountered: {unexpected_ops_seen}",
+        )
+        self.assertEquals(
+            len(partitioned_graphs),
+            1,
+            "Without control flow breaks, there should only be a single graph",
+        )
+        self.assertEquals(
+            len(list(partitioned_graphs[0].named_children())),
+            2,
+            "Certain operators are set to run in Torch, expected 2 segments",
+        )
+
+        torch._dynamo.reset()
+
+        # Validate that the results between Torch and Torch-TRT are similar
+        optimized_model = compile(
+            fx_graph,
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+            torch_executed_ops={"torch.ops.aten.add.Tensor"},
+            use_experimental_rt=True,
+            debug=True,
+        )
+        optimized_model_results = optimized_model(*inputs).detach().cpu()
+        torch_model_results = fx_graph(*inputs).detach().cpu()
+
+        max_diff = float(
+            torch.max(torch.abs(optimized_model_results - torch_model_results))
+        )
+        self.assertAlmostEqual(
+            max_diff,
+            0,
+            DECIMALS_OF_AGREEMENT,
+            f"TRT outputs don't match with the original model.",
+        )
+
+
+class TestCompilationOptions(TestCase):
+    def test_trt_specific_options(self):
+        class SupportedMultiOp(torch.nn.Module):
+            def forward(self, x, y):
+                out = x - y
+                out = out - 3 * x
+                out = out + y
+                out = out - y / 5
+                out = 2 * out
+                return torch.mean(out, dim=-1)
+
+        fx_graph = torch.fx.symbolic_trace(SupportedMultiOp())
+
+        inputs = [
+            torch.randint(-40, 40, (16, 7, 5), dtype=torch.float).cuda(),
+            torch.randint(1, 40, (16, 7, 5), dtype=torch.float).cuda(),
+        ]
+
+        # Validate that the results between Torch and Torch-TRT are similar
+        optimized_model = compile(
+            fx_graph,
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+            use_experimental_rt=True,
+            optimization_level=4,
+            version_compatible=True,
+            max_aux_streams=5,
+            debug=True,
+        )
+        optimized_model_results = optimized_model(*inputs).detach().cpu()
+        torch_model_results = fx_graph(*inputs).detach().cpu()
+
+        max_diff = float(
+            torch.max(torch.abs(optimized_model_results - torch_model_results))
+        )
+        self.assertAlmostEqual(
+            max_diff,
+            0,
+            DECIMALS_OF_AGREEMENT,
+            f"TRT outputs don't match with the original model.",
+        )
+
+
+if __name__ == "__main__":
+    run_tests()
