Skip to content

TensorRT 10.11 release updates #4455

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,22 @@
# TensorRT OSS Release Changelog

## 10.11.0 GA - 2025-5-21

Key Features and Updates:

- Plugin changes
- Migrated `IPluginV2`-descendent version 1 of `modulatedDeformConvPlugin`, to version 2, which implements `IPluginV3`.
- Migrated `IPluginV2`-descendent version 1 of `DisentangledAttention_TRT`, to version 2, which implements `IPluginV3`.
- Migrated `IPluginV2`-descendent version 1 of `MultiscaleDeformableAttnPlugin_TRT`, to version 2, which implements `IPluginV3`.
- Note: The newer versions preserve the attributes and I/O of the corresponding older plugin version. The older plugin versions are deprecated and will be removed in a future release.
- Demo changes
- demoDiffusion
- Added support for Stable Diffusion 3.5-medium and 3.5-large pipelines in BF16 and FP16 precisions.
- Parser changes
- Added `kENABLE_UINT8_AND_ASYMMETRIC_QUANTIZATION_DLA` parser flag to enable UINT8 asymmetric quantization on engines targeting DLA.
- Removed restriction that inputs to `RandomNormalLike` and `RandomUniformLike` must be tensors.
- Clarified limitations of scan outputs for `Loop` nodes.

## 10.10.0 GA - 2025-4-28

Key Features and Updates:
Expand Down
24 changes: 15 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
include(cmake/modules/set_ifndef.cmake)
include(cmake/modules/find_library_create_target.cmake)
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules)

set_ifndef(TRT_LIB_DIR ${CMAKE_BINARY_DIR})
set_ifndef(TRT_OUT_DIR ${CMAKE_BINARY_DIR})
Expand Down Expand Up @@ -47,10 +48,10 @@ else()
set(STATIC_LIB_EXT "a")
endif()

file(STRINGS "${CMAKE_CURRENT_SOURCE_DIR}/include/NvInferVersion.h" VERSION_STRINGS REGEX "#define NV_TENSORRT_.*")
file(STRINGS "${CMAKE_CURRENT_SOURCE_DIR}/include/NvInferVersion.h" VERSION_STRINGS REGEX "#define TRT_.*_ENTERPRISE")

foreach(TYPE MAJOR MINOR PATCH BUILD)
string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]+" TRT_TYPE_STRING ${VERSION_STRINGS})
string(REGEX MATCH "TRT_${TYPE}_ENTERPRISE [0-9]+" TRT_TYPE_STRING ${VERSION_STRINGS})
string(REGEX MATCH "[0-9]+" TRT_${TYPE} ${TRT_TYPE_STRING})
endforeach(TYPE)

Expand Down Expand Up @@ -143,20 +144,25 @@ if(BUILD_PARSERS)
configure_protobuf(${PROTOBUF_VERSION})
endif()

# Define library names
set(TRT_NVINFER_NAME "nvinfer")
set(TRT_ONNXPARSER_NAME "nvonnxparser")

# Windows library names have major version appended.
if (MSVC)
set(nvinfer_lib_name "nvinfer_${TRT_SOVERSION}")
set(nvinfer_lib_name "${TRT_NVINFER_NAME}_${TRT_SOVERSION}${TRT_LIB_SUFFIX}")
set(nvinfer_plugin_lib_name "nvinfer_plugin_${TRT_SOVERSION}")
set(nvinfer_vc_plugin_lib_name "nvinfer_vc_plugin_${TRT_SOVERSION}")
set(nvonnxparser_lib_name "nvonnxparser_${TRT_SOVERSION}")
set(nvonnxparser_lib_name "${TRT_ONNXPARSER_NAME}_${TRT_SOVERSION}${TRT_LIB_SUFFIX}")

else()
set(nvinfer_lib_name "nvinfer")
set(nvinfer_lib_name ${TRT_NVINFER_NAME})
set(nvinfer_plugin_lib_name "nvinfer_plugin")
set(nvinfer_vc_plugin_lib_name "nvinfer_vc_plugin")
set(nvonnxparser_lib_name "nvonnxparser")
set(nvonnxparser_lib_name ${TRT_ONNXPARSER_NAME})
endif()

find_library_create_target(nvinfer ${nvinfer_lib_name} SHARED ${TRT_LIB_DIR})
find_library_create_target(nvinfer ${nvinfer_lib_name} SHARED "${TRT_LIB_DIR}")

if (DEFINED USE_CUGFX)
find_library(CUDART_LIB cugfx_dll HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib/x64 lib64)
Expand Down Expand Up @@ -217,13 +223,13 @@ endif()
if(BUILD_PLUGINS)
add_subdirectory(plugin)
else()
find_library_create_target(nvinfer_plugin ${nvinfer_plugin_lib_name} SHARED ${TRT_OUT_DIR} ${TRT_LIB_DIR})
find_library_create_target(nvinfer_plugin ${nvinfer_plugin_lib_name} SHARED "${TRT_OUT_DIR}" "${TRT_LIB_DIR}")
endif()

if(BUILD_PARSERS)
add_subdirectory(parsers)
else()
find_library_create_target(nvonnxparser ${nvonnxparser_lib_name} SHARED ${TRT_OUT_DIR} ${TRT_LIB_DIR})
find_library_create_target(nvonnxparser ${nvonnxparser_lib_name} SHARED "${TRT_OUT_DIR}" "${TRT_LIB_DIR}")
endif()

if(BUILD_SAMPLES)
Expand Down
18 changes: 9 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ To build the TensorRT-OSS components, you will first need the following software

**TensorRT GA build**

- TensorRT v10.10.0.31
- TensorRT v10.11.0.33
- Available from direct download links listed below

**System Packages**
Expand Down Expand Up @@ -86,24 +86,24 @@ To build the TensorRT-OSS components, you will first need the following software

Else download and extract the TensorRT GA build from [NVIDIA Developer Zone](https://developer.nvidia.com) with the direct links below:

- [TensorRT 10.10.0.31 for CUDA 11.8, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.10.0/tars/TensorRT-10.10.0.31.Linux.x86_64-gnu.cuda-11.8.tar.gz)
- [TensorRT 10.10.0.31 for CUDA 12.9, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.10.0/tars/TensorRT-10.10.0.31.Linux.x86_64-gnu.cuda-12.9.tar.gz)
- [TensorRT 10.10.0.31 for CUDA 11.8, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.10.0/zip/TensorRT-10.10.0.31.Windows.win10.cuda-11.8.zip)
- [TensorRT 10.10.0.31 for CUDA 12.9, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.10.0/zip/TensorRT-10.10.0.31.Windows.win10.cuda-12.9.zip)
- [TensorRT 10.11.0.33 for CUDA 11.8, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/tars/TensorRT-10.11.0.33.Linux.x86_64-gnu.cuda-11.8.tar.gz)
- [TensorRT 10.11.0.33 for CUDA 12.9, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/tars/TensorRT-10.11.0.33.Linux.x86_64-gnu.cuda-12.9.tar.gz)
- [TensorRT 10.11.0.33 for CUDA 11.8, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/zip/TensorRT-10.11.0.33.Windows.win10.cuda-11.8.zip)
- [TensorRT 10.11.0.33 for CUDA 12.9, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/zip/TensorRT-10.11.0.33.Windows.win10.cuda-12.9.zip)

**Example: Ubuntu 20.04 on x86-64 with cuda-12.9**

```bash
cd ~/Downloads
tar -xvzf TensorRT-10.10.0.31.Linux.x86_64-gnu.cuda-12.9.tar.gz
export TRT_LIBPATH=`pwd`/TensorRT-10.10.0.31
tar -xvzf TensorRT-10.11.0.33.Linux.x86_64-gnu.cuda-12.9.tar.gz
export TRT_LIBPATH=`pwd`/TensorRT-10.11.0.33
```

**Example: Windows on x86-64 with cuda-12.9**

```powershell
Expand-Archive -Path TensorRT-10.10.0.31.Windows.win10.cuda-12.9.zip
$env:TRT_LIBPATH="$pwd\TensorRT-10.10.0.31\lib"
Expand-Archive -Path TensorRT-10.11.0.33.Windows.win10.cuda-12.9.zip
$env:TRT_LIBPATH="$pwd\TensorRT-10.11.0.33\lib"
```

## Setting Up The Build Environment
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
10.10.0.31
10.11.0.33
41 changes: 41 additions & 0 deletions cmake/modules/ShouldCompileKernel.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Certain cubins are binary compatible between different SM versions, so they are reused.
# This function checks if a SM-named file should be compiled based on current SM enablement.
# Specifically, the SM80 files are compiled if either 80, 86, or 89 are enabled.
function(should_compile_kernel SM OUT_VAR)
    # Decide whether a kernel file named for SM ${SM} should be compiled, given
    # the architectures currently enabled in CMAKE_CUDA_ARCHITECTURES.
    #
    # Arguments:
    #   SM      - SM version the kernel file is named for (e.g. 80, 90).
    #   OUT_VAR - name of the variable set to TRUE or FALSE in the caller's scope.
    #
    # SM 80/86/89 cubins are binary compatible with each other, so an SM80-named
    # file must be compiled when ANY of 80, 86, or 89 is enabled.
    #
    # NOTE(review): membership is an exact string match — entries such as
    # "80-real"/"80-virtual" in CMAKE_CUDA_ARCHITECTURES would not match;
    # confirm the project always populates bare numeric architectures.
    if(SM EQUAL 80 OR SM EQUAL 86 OR SM EQUAL 89)
        # Any member of the binary-compatible 80/86/89 family enables the SM80 file.
        set(_candidate_sms 80 86 89)
    else()
        set(_candidate_sms ${SM})
    endif()

    # Default to FALSE; flip to TRUE on the first enabled candidate.
    set(${OUT_VAR} FALSE PARENT_SCOPE)
    foreach(_sm IN LISTS _candidate_sms)
        if("${_sm}" IN_LIST CMAKE_CUDA_ARCHITECTURES)
            set(${OUT_VAR} TRUE PARENT_SCOPE)
            break()
        endif()
    endforeach()
endfunction()
4 changes: 2 additions & 2 deletions demo/BERT/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ The following software version configuration has been tested:
| Software | Version |
| -------- | ------- |
| Python | >=3.8 |
| TensorRT | 10.9 |
| CUDA | 12.8 |
| TensorRT | 10.11 |
| CUDA | 12.9 |

## Setup

Expand Down
6 changes: 3 additions & 3 deletions demo/BERT/builder_varseqlen.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,8 @@ def build_engine(batch_sizes, workspace_size, sequence_length, config, weights_d
network_creation_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

with trt.Builder(TRT_LOGGER) as builder, builder.create_network(network_creation_flag) as network, builder.create_builder_config() as builder_config:
builder_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_size * (1024 * 1024))
if workspace_size is not None:
builder_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_size * (1024 * 1024))
builder_config.avg_timing_iterations = 8
if config.use_fp16:
builder_config.set_flag(trt.BuilderFlag.FP16)
Expand Down Expand Up @@ -571,8 +572,7 @@ def main():
parser.add_argument(
"-w",
"--workspace-size",
default=2500,
help="Workspace size in MiB for building the BERT engine (default: 2500)",
help="Workspace size in MiB for building the BERT engine (default: unlimited)",
type=int,
)
parser.add_argument(
Expand Down
22 changes: 15 additions & 7 deletions demo/Diffusion/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ This demo application ("demoDiffusion") showcases the acceleration of Stable Dif
### Clone the TensorRT OSS repository

```bash
git clone [email protected]:NVIDIA/TensorRT.git -b release/10.9 --single-branch
git clone [email protected]:NVIDIA/TensorRT.git -b release/10.11 --single-branch
cd TensorRT
```

Expand Down Expand Up @@ -49,7 +49,7 @@ onnx 1.15.0
onnx-graphsurgeon 0.5.2
onnxruntime 1.16.3
polygraphy 0.49.9
tensorrt 10.9.0.34
tensorrt 10.11.0.33
tokenizers 0.13.3
torch 2.2.0
transformers 4.42.2
Expand Down Expand Up @@ -199,19 +199,27 @@ Even faster image generation than LCM, producing coherent images in just 1 step.
python3 demo_txt2img_xl.py "Einstein" --version xl-turbo --onnx-dir onnx-sdxl-turbo --engine-dir engine-sdxl-turbo --denoising-steps 1 --scheduler EulerA --guidance-scale 0.0 --width 512 --height 512
```

### Generate an image guided by a text prompt using Stable Diffusion 3
### Generate an image guided by a text prompt using Stable Diffusion 3 and its variants

Run the command below to generate an image using Stable Diffusion 3
Run the command below to generate an image using Stable Diffusion 3 and Stable Diffusion 3.5

```bash
# Stable Diffusion 3
python3 demo_txt2img_sd3.py "A vibrant street wall covered in colorful graffiti, the centerpiece spells \"SD3 MEDIUM\", in a storm of colors" --version sd3 --hf-token=$HF_TOKEN

# Stable Diffusion 3.5-medium
python3 demo_txt2img_sd35.py "a beautiful photograph of Mt. Fuji during cherry blossom" --version=3.5-medium --denoising-steps=30 --guidance-scale 3.5 --hf-token=$HF_TOKEN

# Stable Diffusion 3.5-large
python3 demo_txt2img_sd35.py "a beautiful photograph of Mt. Fuji during cherry blossom" --version=3.5-large --denoising-steps=30 --guidance-scale 3.5 --hf-token=$HF_TOKEN
```

You can also specify an input image conditioning as shown below

```bash
wget https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png -O dog-on-bench.png

# Stable Diffusion 3
python3 demo_txt2img_sd3.py "dog wearing a sweater and a blue collar" --version sd3 --input-image dog-on-bench.png --hf-token=$HF_TOKEN
```

Expand Down Expand Up @@ -352,7 +360,7 @@ You can use the `--calibration-dataset` flag to specify the path, which is set to
python3 demo_img2img_flux.py "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts." --version="flux.1-dev-depth" --hf-token=$HF_TOKEN --guidance-scale 10 --control-image robot.png --bf16 --denoising-steps 30 --download-onnx-models

# FP8 using pre-exported ONNX models
python3 demo_img2img_flux.py "A robot made of exotic candies" --version="flux.1-dev-depth" --hf-token=$HF_TOKEN --guidance-scale 10 --control-image robot.png --fp8 --denoising-steps 30 --download-onnx-models --build-static-batch
python3 demo_img2img_flux.py "A robot made of exotic candies" --version="flux.1-dev-depth" --hf-token=$HF_TOKEN --guidance-scale 10 --control-image robot.png --fp8 --denoising-steps 30 --download-onnx-models --build-static-batch --quantization-level 4

# FP8 using native ONNX export
rm -rf onnx/* engine/* && python3 demo_img2img_flux.py "A robot made of exotic candies" --version="flux.1-dev-depth" --hf-token=$HF_TOKEN --guidance-scale 10 --control-image robot.png --quantization-level 4 --fp8 --denoising-steps 30
Expand All @@ -368,13 +376,13 @@ python3 demo_img2img_flux.py "A robot made of exotic candies" --version="flux.1-
python3 demo_img2img_flux.py "a robot made out of gold" --version="flux.1-dev-canny" --hf-token=$HF_TOKEN --guidance-scale 30 --control-image robot.png --bf16 --denoising-steps 30 --download-onnx-models

# FP8 using pre-exported ONNX models
python3 demo_img2img_flux.py "a robot made out of gold" --version="flux.1-dev-canny" --hf-token=$HF_TOKEN --guidance-scale 30 --control-image robot.png --fp8 --denoising-steps 30 --download-onnx-models --build-static-batch
python3 demo_img2img_flux.py "a robot made out of gold" --version="flux.1-dev-canny" --hf-token=$HF_TOKEN --guidance-scale 30 --control-image robot.png --fp8 --denoising-steps 30 --download-onnx-models --build-static-batch --quantization-level 4

# FP8 using native ONNX export
rm -rf onnx/* engine/* && python3 demo_img2img_flux.py "a robot made out of gold" --version="flux.1-dev-canny" --hf-token=$HF_TOKEN --guidance-scale 30 --control-image robot.png --quantization-level 4 --fp8 --denoising-steps 30 --calibration-dataset {custom/dataset/path}

# FP4
python3 demo_img2img_flux.py "a robot made out of gold" --version="flux.1-dev-canny" --hf-token=$HF_TOKEN --guidance-scale 30 --control-image robot.png --fp4 --denoising-steps 30 --download-onnx-models
python3 demo_img2img_flux.py "a robot made out of gold" --version="flux.1-dev-canny" --hf-token=$HF_TOKEN --guidance-scale 30 --control-image robot.png --fp4 --denoising-steps 30 --download-onnx-models --build-static-batch
```

#### 4. Generate an Image Using Flux LoRA
Expand Down
8 changes: 5 additions & 3 deletions demo/Diffusion/demo_diffusion/dd_argparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ def add_arguments(parser):
"xl-turbo",
"svd-xt-1.1",
"sd3",
"3.5-medium",
"3.5-large",
"cascade",
"flux.1-dev",
"flux.1-schnell",
Expand Down Expand Up @@ -274,6 +276,7 @@ def process_pipeline_args(args: argparse.Namespace) -> Tuple[Dict[str, Any], Dic
sm_version = device_info.major * 10 + device_info.minor

is_flux = args.version.startswith("flux")
is_sd35 = args.version.startswith("3.5")

if args.height % 8 != 0 or args.width % 8 != 0:
raise ValueError(
Expand Down Expand Up @@ -336,7 +339,6 @@ def override_quant_level(level: float, dtype_str: str):
elif args.int8:
override_quant_level(3.0, "INT8")


if args.quantization_level == 3.0 and args.download_onnx_models:
raise ValueError(
"Transformer ONNX model for Quantization level 3 is not available for download. Please export the quantized Transformer model natively with the removal of --download-onnx-models."
Expand Down Expand Up @@ -366,7 +368,7 @@ def override_quant_level(level: float, dtype_str: str):

# Torch-fallback and Torch-inference
if args.torch_fallback and not args.torch_inference:
assert is_flux, "PyTorch Fallback is only supported for Flux pipelines"
assert is_flux or is_sd35, "PyTorch Fallback is only supported for Flux and Stable Diffusion 3.5 pipelines."
args.torch_fallback = args.torch_fallback.split(",")

if args.torch_fallback and args.torch_inference:
Expand All @@ -377,7 +379,7 @@ def override_quant_level(level: float, dtype_str: str):

# low-vram
if args.low_vram:
assert is_flux, "low-vram mode is only supported for Flux pipelines"
assert is_flux or is_sd35, "low-vram mode is only supported for Flux and Stable Diffusion 3.5 pipelines."

# Pack arguments
kwargs_init_pipeline = {
Expand Down
5 changes: 3 additions & 2 deletions demo/Diffusion/demo_diffusion/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,17 @@
from collections import OrderedDict, defaultdict

import numpy as np
import onnx
import tensorrt as trt
import torch
from cuda import cudart
from onnx import numpy_helper
from polygraphy.backend.common import bytes_from_path
from polygraphy.backend.trt import (
engine_from_bytes,
)

import onnx
from onnx import numpy_helper

TRT_LOGGER = trt.Logger(trt.Logger.ERROR)


Expand Down
2 changes: 2 additions & 0 deletions demo/Diffusion/demo_diffusion/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from demo_diffusion.model.diffusion_transformer import (
FluxTransformerModel,
SD3_MMDiTModel,
SD3TransformerModel,
)
from demo_diffusion.model.gan import VQGANModel
from demo_diffusion.model.load import unload_torch_model
Expand Down Expand Up @@ -67,6 +68,7 @@
# diffusion_transformer
"SD3_MMDiTModel",
"FluxTransformerModel",
"SD3TransformerModel",
# gan
"VQGANModel",
# lora
Expand Down
Loading