Skip to content

TensorRT 10.11 release updates #4455

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,22 @@
# TensorRT OSS Release Changelog

## 10.11.0 GA - 2025-5-21

Key Features and Updates:

- Plugin changes
- Migrated `IPluginV2`-descendent version 1 of `modulatedDeformConvPlugin`, to version 2, which implements `IPluginV3`.
- Migrated `IPluginV2`-descendent version 1 of `DisentangledAttention_TRT`, to version 2, which implements `IPluginV3`.
- Migrated `IPluginV2`-descendent version 1 of `MultiscaleDeformableAttnPlugin_TRT`, to version 2, which implements `IPluginV3`.
- Note: The newer versions preserve the attributes and I/O of the corresponding older plugin version. The older plugin versions are deprecated and will be removed in a future release.
- Demo changes
- demoDiffusion
- Added support for Stable Diffusion 3.5-medium and 3.5-large pipelines in BF16 and FP16 precisions.
- Parser changes
- Added `kENABLE_UINT8_AND_ASYMMETRIC_QUANTIZATION_DLA` parser flag to enable UINT8 asymmetric quantization on engines targeting DLA.
- Removed restriction that inputs to `RandomNormalLike` and `RandomUniformLike` must be tensors.
- Clarified limitations of scan outputs for `Loop` nodes.

## 10.10.0 GA - 2025-4-28

Key Features and Updates:
Expand Down
24 changes: 15 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
include(cmake/modules/set_ifndef.cmake)
include(cmake/modules/find_library_create_target.cmake)
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules)

set_ifndef(TRT_LIB_DIR ${CMAKE_BINARY_DIR})
set_ifndef(TRT_OUT_DIR ${CMAKE_BINARY_DIR})
Expand Down Expand Up @@ -47,10 +48,10 @@ else()
set(STATIC_LIB_EXT "a")
endif()

file(STRINGS "${CMAKE_CURRENT_SOURCE_DIR}/include/NvInferVersion.h" VERSION_STRINGS REGEX "#define NV_TENSORRT_.*")
file(STRINGS "${CMAKE_CURRENT_SOURCE_DIR}/include/NvInferVersion.h" VERSION_STRINGS REGEX "#define TRT_.*_ENTERPRISE")

foreach(TYPE MAJOR MINOR PATCH BUILD)
string(REGEX MATCH "NV_TENSORRT_${TYPE} [0-9]+" TRT_TYPE_STRING ${VERSION_STRINGS})
string(REGEX MATCH "TRT_${TYPE}_ENTERPRISE [0-9]+" TRT_TYPE_STRING ${VERSION_STRINGS})
string(REGEX MATCH "[0-9]+" TRT_${TYPE} ${TRT_TYPE_STRING})
endforeach(TYPE)

Expand Down Expand Up @@ -143,20 +144,25 @@ if(BUILD_PARSERS)
configure_protobuf(${PROTOBUF_VERSION})
endif()

# Define library names
set(TRT_NVINFER_NAME "nvinfer")
set(TRT_ONNXPARSER_NAME "nvonnxparser")

# Windows library names have major version appended.
if (MSVC)
set(nvinfer_lib_name "nvinfer_${TRT_SOVERSION}")
set(nvinfer_lib_name "${TRT_NVINFER_NAME}_${TRT_SOVERSION}${TRT_LIB_SUFFIX}")
set(nvinfer_plugin_lib_name "nvinfer_plugin_${TRT_SOVERSION}")
set(nvinfer_vc_plugin_lib_name "nvinfer_vc_plugin_${TRT_SOVERSION}")
set(nvonnxparser_lib_name "nvonnxparser_${TRT_SOVERSION}")
set(nvonnxparser_lib_name "${TRT_ONNXPARSER_NAME}_${TRT_SOVERSION}${TRT_LIB_SUFFIX}")

else()
set(nvinfer_lib_name "nvinfer")
set(nvinfer_lib_name ${TRT_NVINFER_NAME})
set(nvinfer_plugin_lib_name "nvinfer_plugin")
set(nvinfer_vc_plugin_lib_name "nvinfer_vc_plugin")
set(nvonnxparser_lib_name "nvonnxparser")
set(nvonnxparser_lib_name ${TRT_ONNXPARSER_NAME})
endif()

find_library_create_target(nvinfer ${nvinfer_lib_name} SHARED ${TRT_LIB_DIR})
find_library_create_target(nvinfer ${nvinfer_lib_name} SHARED "${TRT_LIB_DIR}")

if (DEFINED USE_CUGFX)
find_library(CUDART_LIB cugfx_dll HINTS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib/x64 lib64)
Expand Down Expand Up @@ -217,13 +223,13 @@ endif()
if(BUILD_PLUGINS)
add_subdirectory(plugin)
else()
find_library_create_target(nvinfer_plugin ${nvinfer_plugin_lib_name} SHARED ${TRT_OUT_DIR} ${TRT_LIB_DIR})
find_library_create_target(nvinfer_plugin ${nvinfer_plugin_lib_name} SHARED "${TRT_OUT_DIR}" "${TRT_LIB_DIR}")
endif()

if(BUILD_PARSERS)
add_subdirectory(parsers)
else()
find_library_create_target(nvonnxparser ${nvonnxparser_lib_name} SHARED ${TRT_OUT_DIR} ${TRT_LIB_DIR})
find_library_create_target(nvonnxparser ${nvonnxparser_lib_name} SHARED "${TRT_OUT_DIR}" "${TRT_LIB_DIR}")
endif()

if(BUILD_SAMPLES)
Expand Down
18 changes: 9 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ To build the TensorRT-OSS components, you will first need the following software

**TensorRT GA build**

- TensorRT v10.10.0.31
- TensorRT v10.11.0.33
- Available from direct download links listed below

**System Packages**
Expand Down Expand Up @@ -86,24 +86,24 @@ To build the TensorRT-OSS components, you will first need the following software

Else download and extract the TensorRT GA build from [NVIDIA Developer Zone](https://developer.nvidia.com) with the direct links below:

- [TensorRT 10.10.0.31 for CUDA 11.8, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.10.0/tars/TensorRT-10.10.0.31.Linux.x86_64-gnu.cuda-11.8.tar.gz)
- [TensorRT 10.10.0.31 for CUDA 12.9, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.10.0/tars/TensorRT-10.10.0.31.Linux.x86_64-gnu.cuda-12.9.tar.gz)
- [TensorRT 10.10.0.31 for CUDA 11.8, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.10.0/zip/TensorRT-10.10.0.31.Windows.win10.cuda-11.8.zip)
- [TensorRT 10.10.0.31 for CUDA 12.9, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.10.0/zip/TensorRT-10.10.0.31.Windows.win10.cuda-12.9.zip)
- [TensorRT 10.11.0.33 for CUDA 11.8, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/tars/TensorRT-10.11.0.33.Linux.x86_64-gnu.cuda-11.8.tar.gz)
- [TensorRT 10.11.0.33 for CUDA 12.9, Linux x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/tars/TensorRT-10.11.0.33.Linux.x86_64-gnu.cuda-12.9.tar.gz)
- [TensorRT 10.11.0.33 for CUDA 11.8, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/zip/TensorRT-10.11.0.33.Windows.win10.cuda-11.8.zip)
- [TensorRT 10.11.0.33 for CUDA 12.9, Windows x86_64](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/zip/TensorRT-10.11.0.33.Windows.win10.cuda-12.9.zip)

**Example: Ubuntu 20.04 on x86-64 with cuda-12.9**

```bash
cd ~/Downloads
tar -xvzf TensorRT-10.10.0.31.Linux.x86_64-gnu.cuda-12.9.tar.gz
export TRT_LIBPATH=`pwd`/TensorRT-10.10.0.31
tar -xvzf TensorRT-10.11.0.33.Linux.x86_64-gnu.cuda-12.9.tar.gz
export TRT_LIBPATH=`pwd`/TensorRT-10.11.0.33
```

**Example: Windows on x86-64 with cuda-12.9**

```powershell
Expand-Archive -Path TensorRT-10.10.0.31.Windows.win10.cuda-12.9.zip
$env:TRT_LIBPATH="$pwd\TensorRT-10.10.0.31\lib"
Expand-Archive -Path TensorRT-10.11.0.33.Windows.win10.cuda-12.9.zip
$env:TRT_LIBPATH="$pwd\TensorRT-10.11.0.33\lib"
```

## Setting Up The Build Environment
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
10.10.0.31
10.11.0.33
41 changes: 41 additions & 0 deletions cmake/modules/ShouldCompileKernel.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Certain cubins are binary compatible between different SM versions, so they are reused.
# This function checks if a SM-named file should be compiled based on current SM enablement.
# Specifically, the SM80 files are compiled if either 80, 86, or 89 are enabled.
function(should_compile_kernel SM OUT_VAR)
    # Decide whether a kernel file named for SM ${SM} should be compiled, given
    # the architectures currently enabled in CMAKE_CUDA_ARCHITECTURES.
    #
    # Arguments:
    #   SM      - SM version the kernel file is named for (e.g. 80, 90).
    #   OUT_VAR - name of the variable set to TRUE or FALSE in the caller's scope.
    #
    # SM 80/86/89 cubins are binary compatible with each other, so an SM80-named
    # file must be compiled when ANY of 80, 86, or 89 is enabled.
    #
    # NOTE(review): membership is an exact string match — entries such as
    # "80-real"/"80-virtual" in CMAKE_CUDA_ARCHITECTURES would not match;
    # confirm the project always populates bare numeric architectures.
    if(SM EQUAL 80 OR SM EQUAL 86 OR SM EQUAL 89)
        # Any member of the binary-compatible 80/86/89 family enables the SM80 file.
        set(_candidate_sms 80 86 89)
    else()
        set(_candidate_sms ${SM})
    endif()

    # Default to FALSE; flip to TRUE on the first enabled candidate.
    set(${OUT_VAR} FALSE PARENT_SCOPE)
    foreach(_sm IN LISTS _candidate_sms)
        if("${_sm}" IN_LIST CMAKE_CUDA_ARCHITECTURES)
            set(${OUT_VAR} TRUE PARENT_SCOPE)
            break()
        endif()
    endforeach()
endfunction()
4 changes: 2 additions & 2 deletions demo/BERT/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ The following software version configuration has been tested:
| Software | Version |
| -------- | ------- |
| Python | >=3.8 |
| TensorRT | 10.9 |
| CUDA | 12.8 |
| TensorRT | 10.11 |
| CUDA | 12.9 |

## Setup

Expand Down
6 changes: 3 additions & 3 deletions demo/BERT/builder_varseqlen.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,8 @@ def build_engine(batch_sizes, workspace_size, sequence_length, config, weights_d
network_creation_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

with trt.Builder(TRT_LOGGER) as builder, builder.create_network(network_creation_flag) as network, builder.create_builder_config() as builder_config:
builder_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_size * (1024 * 1024))
if workspace_size is not None:
builder_config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_size * (1024 * 1024))
builder_config.avg_timing_iterations = 8
if config.use_fp16:
builder_config.set_flag(trt.BuilderFlag.FP16)
Expand Down Expand Up @@ -571,8 +572,7 @@ def main():
parser.add_argument(
"-w",
"--workspace-size",
default=2500,
help="Workspace size in MiB for building the BERT engine (default: 2500)",
help="Workspace size in MiB for building the BERT engine (default: unlimited)",
type=int,
)
parser.add_argument(
Expand Down
22 changes: 15 additions & 7 deletions demo/Diffusion/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ This demo application ("demoDiffusion") showcases the acceleration of Stable Dif
### Clone the TensorRT OSS repository

```bash
git clone [email protected]:NVIDIA/TensorRT.git -b release/10.9 --single-branch
git clone [email protected]:NVIDIA/TensorRT.git -b release/10.11 --single-branch
cd TensorRT
```

Expand Down Expand Up @@ -49,7 +49,7 @@ onnx 1.15.0
onnx-graphsurgeon 0.5.2
onnxruntime 1.16.3
polygraphy 0.49.9
tensorrt 10.9.0.34
tensorrt 10.11.0.33
tokenizers 0.13.3
torch 2.2.0
transformers 4.42.2
Expand Down Expand Up @@ -199,19 +199,27 @@ Even faster image generation than LCM, producing coherent images in just 1 step.
python3 demo_txt2img_xl.py "Einstein" --version xl-turbo --onnx-dir onnx-sdxl-turbo --engine-dir engine-sdxl-turbo --denoising-steps 1 --scheduler EulerA --guidance-scale 0.0 --width 512 --height 512
```

### Generate an image guided by a text prompt using Stable Diffusion 3
### Generate an image guided by a text prompt using Stable Diffusion 3 and its variants

Run the command below to generate an image using Stable Diffusion 3
Run the command below to generate an image using Stable Diffusion 3 and Stable Diffusion 3.5

```bash
# Stable Diffusion 3
python3 demo_txt2img_sd3.py "A vibrant street wall covered in colorful graffiti, the centerpiece spells \"SD3 MEDIUM\", in a storm of colors" --version sd3 --hf-token=$HF_TOKEN

# Stable Diffusion 3.5-medium
python3 demo_txt2img_sd35.py "a beautiful photograph of Mt. Fuji during cherry blossom" --version=3.5-medium --denoising-steps=30 --guidance-scale 3.5 --hf-token=$HF_TOKEN

# Stable Diffusion 3.5-large
python3 demo_txt2img_sd35.py "a beautiful photograph of Mt. Fuji during cherry blossom" --version=3.5-large --denoising-steps=30 --guidance-scale 3.5 --hf-token=$HF_TOKEN
```

You can also specify an input image conditioning as shown below

```bash
wget https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png -O dog-on-bench.png

# Stable Diffusion 3
python3 demo_txt2img_sd3.py "dog wearing a sweater and a blue collar" --version sd3 --input-image dog-on-bench.png --hf-token=$HF_TOKEN
```

Expand Down Expand Up @@ -352,7 +360,7 @@ You can use the `--calibration-dataset` flag to specify the path, which is set to
python3 demo_img2img_flux.py "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts." --version="flux.1-dev-depth" --hf-token=$HF_TOKEN --guidance-scale 10 --control-image robot.png --bf16 --denoising-steps 30 --download-onnx-models

# FP8 using pre-exported ONNX models
python3 demo_img2img_flux.py "A robot made of exotic candies" --version="flux.1-dev-depth" --hf-token=$HF_TOKEN --guidance-scale 10 --control-image robot.png --fp8 --denoising-steps 30 --download-onnx-models --build-static-batch
python3 demo_img2img_flux.py "A robot made of exotic candies" --version="flux.1-dev-depth" --hf-token=$HF_TOKEN --guidance-scale 10 --control-image robot.png --fp8 --denoising-steps 30 --download-onnx-models --build-static-batch --quantization-level 4

# FP8 using native ONNX export
rm -rf onnx/* engine/* && python3 demo_img2img_flux.py "A robot made of exotic candies" --version="flux.1-dev-depth" --hf-token=$HF_TOKEN --guidance-scale 10 --control-image robot.png --quantization-level 4 --fp8 --denoising-steps 30
Expand All @@ -368,13 +376,13 @@ python3 demo_img2img_flux.py "A robot made of exotic candies" --version="flux.1-
python3 demo_img2img_flux.py "a robot made out of gold" --version="flux.1-dev-canny" --hf-token=$HF_TOKEN --guidance-scale 30 --control-image robot.png --bf16 --denoising-steps 30 --download-onnx-models

# FP8 using pre-exported ONNX models
python3 demo_img2img_flux.py "a robot made out of gold" --version="flux.1-dev-canny" --hf-token=$HF_TOKEN --guidance-scale 30 --control-image robot.png --fp8 --denoising-steps 30 --download-onnx-models --build-static-batch
python3 demo_img2img_flux.py "a robot made out of gold" --version="flux.1-dev-canny" --hf-token=$HF_TOKEN --guidance-scale 30 --control-image robot.png --fp8 --denoising-steps 30 --download-onnx-models --build-static-batch --quantization-level 4

# FP8 using native ONNX export
rm -rf onnx/* engine/* && python3 demo_img2img_flux.py "a robot made out of gold" --version="flux.1-dev-canny" --hf-token=$HF_TOKEN --guidance-scale 30 --control-image robot.png --quantization-level 4 --fp8 --denoising-steps 30 --calibration-dataset {custom/dataset/path}

# FP4
python3 demo_img2img_flux.py "a robot made out of gold" --version="flux.1-dev-canny" --hf-token=$HF_TOKEN --guidance-scale 30 --control-image robot.png --fp4 --denoising-steps 30 --download-onnx-models
python3 demo_img2img_flux.py "a robot made out of gold" --version="flux.1-dev-canny" --hf-token=$HF_TOKEN --guidance-scale 30 --control-image robot.png --fp4 --denoising-steps 30 --download-onnx-models --build-static-batch
```

#### 4. Generate an Image Using Flux LoRA
Expand Down
8 changes: 5 additions & 3 deletions demo/Diffusion/demo_diffusion/dd_argparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ def add_arguments(parser):
"xl-turbo",
"svd-xt-1.1",
"sd3",
"3.5-medium",
"3.5-large",
"cascade",
"flux.1-dev",
"flux.1-schnell",
Expand Down Expand Up @@ -274,6 +276,7 @@ def process_pipeline_args(args: argparse.Namespace) -> Tuple[Dict[str, Any], Dic
sm_version = device_info.major * 10 + device_info.minor

is_flux = args.version.startswith("flux")
is_sd35 = args.version.startswith("3.5")

if args.height % 8 != 0 or args.width % 8 != 0:
raise ValueError(
Expand Down Expand Up @@ -336,7 +339,6 @@ def override_quant_level(level: float, dtype_str: str):
elif args.int8:
override_quant_level(3.0, "INT8")


if args.quantization_level == 3.0 and args.download_onnx_models:
raise ValueError(
"Transformer ONNX model for Quantization level 3 is not available for download. Please export the quantized Transformer model natively with the removal of --download-onnx-models."
Expand Down Expand Up @@ -366,7 +368,7 @@ def override_quant_level(level: float, dtype_str: str):

# Torch-fallback and Torch-inference
if args.torch_fallback and not args.torch_inference:
assert is_flux, "PyTorch Fallback is only supported for Flux pipelines"
assert is_flux or is_sd35, "PyTorch Fallback is only supported for Flux and Stable Diffusion 3.5 pipelines."
args.torch_fallback = args.torch_fallback.split(",")

if args.torch_fallback and args.torch_inference:
Expand All @@ -377,7 +379,7 @@ def override_quant_level(level: float, dtype_str: str):

# low-vram
if args.low_vram:
assert is_flux, "low-vram mode is only supported for Flux pipelines"
assert is_flux or is_sd35, "low-vram mode is only supported for Flux and Stable Diffusion 3.5 pipelines."

# Pack arguments
kwargs_init_pipeline = {
Expand Down
5 changes: 3 additions & 2 deletions demo/Diffusion/demo_diffusion/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,17 @@
from collections import OrderedDict, defaultdict

import numpy as np
import onnx
import tensorrt as trt
import torch
from cuda import cudart
from onnx import numpy_helper
from polygraphy.backend.common import bytes_from_path
from polygraphy.backend.trt import (
engine_from_bytes,
)

import onnx
from onnx import numpy_helper

TRT_LOGGER = trt.Logger(trt.Logger.ERROR)


Expand Down
2 changes: 2 additions & 0 deletions demo/Diffusion/demo_diffusion/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from demo_diffusion.model.diffusion_transformer import (
FluxTransformerModel,
SD3_MMDiTModel,
SD3TransformerModel,
)
from demo_diffusion.model.gan import VQGANModel
from demo_diffusion.model.load import unload_torch_model
Expand Down Expand Up @@ -67,6 +68,7 @@
# diffusion_transformer
"SD3_MMDiTModel",
"FluxTransformerModel",
"SD3TransformerModel",
# gan
"VQGANModel",
# lora
Expand Down
Loading