Commit 97bc680
feat: support kv cache reuse for MLA (#3571)
* support kv cache reuse for MLA: load compressed_kv and k_pe and do up-projection; use 192/128 head size MLA context kernel; support Blackwell and Hopper now
  Signed-off-by: Zhen Huang <[email protected]>
* add CI test
  Signed-off-by: Zhen Huang <[email protected]>
* fix: set k_pe head_num to 1 for kernel 2 and kernel 2V2
  Signed-off-by: Mingyang Jiang <[email protected]>
* resolve comments
  Signed-off-by: Zhen Huang <[email protected]>
* use GPTJ style RoPE for MLA
  Signed-off-by: Zhen Huang <[email protected]>
* fix rebase error and some docs
  Signed-off-by: Zhen Huang <[email protected]>
* fix kv_lens
  Signed-off-by: Zhen Huang <[email protected]>
* tiny fix
  Signed-off-by: Zhen Huang <[email protected]>
* fix torch compile
  Signed-off-by: Zhen Huang <[email protected]>
* fix: use normal device memory instead of pinned memory for unit test
  Signed-off-by: Mingyang Jiang <[email protected]>
* fix L0 tests
  Signed-off-by: Zhen Huang <[email protected]>
* fix torch compile after rebase
  Signed-off-by: Zhen Huang <[email protected]>
* resolve comments
  Signed-off-by: Zhen Huang <[email protected]>
* resolve comments again
  Signed-off-by: Zhen Huang <[email protected]>

---------

Signed-off-by: Zhen Huang <[email protected]>
Signed-off-by: Mingyang Jiang <[email protected]>
Signed-off-by: zhhuang-nv <[email protected]>
Co-authored-by: Mingyang Jiang <[email protected]>
1 parent b4e5df0 commit 97bc680

32 files changed: +14638 -9067 lines
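
At a glance, the change teaches the KV cache manager and the attention op to keep MLA's compressed latent in the paged KV cache, so blocks can be reused across requests, while the context kernels attend over up-projected 192/128 heads. Below is a minimal standalone sketch of the head-size arithmetic behind the "192x128" and "576x512" kernel names that appear in this commit; the DeepSeek-style dimension values are assumptions for illustration, not values read from the change itself.

    // Hypothetical MLA dimensions (DeepSeek-style); only the derived 192/128 and 576/512
    // figures are visible in this commit's kernel names.
    #include <cstdio>

    int main()
    {
        int const qkNopeHeadDim = 128; // no-RoPE part of each Q/K head after up-projection (assumed)
        int const qkRopeHeadDim = 64;  // RoPE part of each Q/K head, i.e. k_pe (assumed)
        int const vHeadDim = 128;      // V head size after up-projection (assumed)
        int const kvLoraRank = 512;    // size of the compressed latent, compressed_kv (assumed)

        // Context phase: compressed_kv and k_pe are loaded and up-projected, so FMHA sees
        // Q/K of 128 + 64 = 192 and V of 128, matching the "192x128" kernels added here.
        std::printf("context QK = %d, V = %d\n", qkNopeHeadDim + qkRopeHeadDim, vHeadDim);

        // The compressed cache itself holds 512 + 64 = 576 values per token, matching the
        // "576x512" paged-KV kernel referenced in the cubin table below.
        std::printf("cached latent + rope = %d, latent only = %d\n", kvLoraRank + qkRopeHeadDim, kvLoraRank);
        return 0;
    }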

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 4 additions & 2 deletions
@@ -456,6 +456,7 @@ class KVCacheBlockPool
 {
 public:
     SizeType32 numLayers;
+    SizeType32 kvFactor;
     SizeType32 numKvHeads;
     SizeType32 sizePerHead;
     SizeType32 tokensPerBlock;
@@ -469,10 +470,11 @@ class KVCacheBlockPool
     // FP4 KV caches have extra pools that contain second level scales for dequantization.
     bool containsBlockScales;

-    KVCacheBlockPool(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
-        SizeType32 quantSize, runtime::ITensor::SharedPtr primaryPtr = nullptr,
+    KVCacheBlockPool(SizeType32 numLayers, SizeType32 kvFactor, SizeType32 numKvHeads, SizeType32 sizePerHead,
+        SizeType32 tokensPerBlock, SizeType32 quantSize, runtime::ITensor::SharedPtr primaryPtr = nullptr,
         runtime::ITensor::SharedPtr secondaryPtr = nullptr, bool containsBlockScales = false)
         : numLayers(numLayers)
+        , kvFactor(kvFactor)
        , numKvHeads(numKvHeads)
        , sizePerHead(sizePerHead)
        , tokensPerBlock(tokensPerBlock)
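
To see what the new kvFactor member buys: with separate K and V planes a block holds numLayers * 2 * numKvHeads * tokensPerBlock * sizePerHead elements, whereas a cache that stores only MLA's compressed latent needs a single plane. A minimal sketch under that assumption follows; kvFactor = 1 for MLA is inferred from the shape check further down, and the concrete numbers are illustrative, not taken from this change.

    // Per-block element count as a function of kvFactor: 2 for separate K/V planes,
    // 1 assumed for an MLA-style compressed cache.
    #include <cstdint>
    #include <cstdio>

    using SizeType32 = std::int32_t;

    static std::int64_t blockElems(SizeType32 numLayers, SizeType32 kvFactor, SizeType32 numKvHeads,
        SizeType32 sizePerHead, SizeType32 tokensPerBlock)
    {
        return static_cast<std::int64_t>(numLayers) * kvFactor * numKvHeads * sizePerHead * tokensPerBlock;
    }

    int main()
    {
        // Illustrative configurations only.
        std::printf("standard KV: %lld elems/block\n", (long long) blockElems(32, /*kvFactor=*/2, 8, 128, 64));
        std::printf("MLA latent:  %lld elems/block\n", (long long) blockElems(61, /*kvFactor=*/1, 1, 576, 64));
        return 0;
    }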

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 3 additions & 3 deletions
@@ -557,7 +557,7 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
                 mLayerToPoolIndex[layerIdx] = poolIndex;
             }
         }
-        mPools.emplace_back(numLayers, numKvHeads, sizePerHead, tokensPerBlock, 1);
+        mPools.emplace_back(numLayers, mKVFactor, numKvHeads, sizePerHead, tokensPerBlock, 1);
         ++poolIndex;
     }

@@ -649,8 +649,8 @@ void WindowBlockManager::createBlockScalePools(SizeType32 quantBlockSize)
         TLLM_CHECK_WITH_INFO(kv_pool.blockSize % quantBlockSize == 0,
             "Cannot use FP4 quantization since kv_pool.blockSize is not divisible by FP4 quantBlockSize.");

-        mPools.emplace_back(kv_pool.numLayers, kv_pool.numKvHeads, kv_pool.sizePerHead, kv_pool.tokensPerBlock,
-            quantBlockSize,
+        mPools.emplace_back(kv_pool.numLayers, kv_pool.kvFactor, kv_pool.numKvHeads, kv_pool.sizePerHead,
+            kv_pool.tokensPerBlock, quantBlockSize,
             /*primaryPool=*/nullptr,
             /*secondaryPool=*/nullptr,
             /*containsBlockScales=*/true);

cpp/tensorrt_llm/batch_manager/kvCacheTransferManager.cpp

Lines changed: 2 additions & 1 deletion
@@ -78,13 +78,14 @@ void KVCacheTransferManager::copyBlock(BlockPtr const& src, BlockPtr const& dst,
     {
         auto stream = (isOffload ? mOffloadManager : mOnboardManager).getStream().get();
         int const numLayers = pools[poolIdx].numLayers;
+        int const kvFactor = pools[poolIdx].kvFactor;
         int const numHeads = pools[poolIdx].numKvHeads;
         int const sizePerHead = pools[poolIdx].sizePerHead;
         auto shape = srcPtr->getShape();
         TLLM_LOG_DEBUG("block.Shape = %s", srcPtr->toString(shape).c_str());
         TLLM_CHECK_WITH_INFO(
             shape.nbDims == 4, "Expected KVCache block to have 4 dimensions, but it has %d", shape.nbDims);
-        TLLM_CHECK_WITH_INFO((shape.d[0] == 1) && (shape.d[1] == numLayers) && (shape.d[2] == 2)
+        TLLM_CHECK_WITH_INFO((shape.d[0] == 1) && (shape.d[1] == numLayers) && (shape.d[2] == kvFactor)
                 && (shape.d[3] == numHeads * tokensPerBlock * sizePerHead),
             "Block shape is incorrect");
         TLLM_CHECK_WITH_INFO(numTokensToCopy <= tokensPerBlock,
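
The check above now expects each block tensor to be [1, numLayers, kvFactor, numKvHeads * tokensPerBlock * sizePerHead], so the same transfer path accepts both the classic two-plane layout and a one-plane MLA layout. A small self-contained sketch of that invariant; the dimension values are illustrative assumptions.

    // Mirror of the block-shape check in copyBlock, on hypothetical dimensions.
    #include <array>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        int const numLayers = 61, kvFactor = 1, numKvHeads = 1, sizePerHead = 576, tokensPerBlock = 64;
        std::array<std::int64_t, 4> shape = {1, numLayers, kvFactor,
            static_cast<std::int64_t>(numKvHeads) * tokensPerBlock * sizePerHead};

        bool const ok = shape[0] == 1 && shape[1] == numLayers && shape[2] == kvFactor
            && shape[3] == static_cast<std::int64_t>(numKvHeads) * tokensPerBlock * sizePerHead;
        std::printf("block shape [1, %d, %d, %lld] %s\n", numLayers, kvFactor, (long long) shape[3],
            ok ? "matches the expected layout" : "does not match");
        return 0;
    }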

cpp/tensorrt_llm/common/attentionOp.cpp

Lines changed: 45 additions & 6 deletions
@@ -17,6 +17,7 @@
 #include "attentionOp.h"
 #include "tensorrt_llm/common/assert.h"
 #include "tensorrt_llm/common/envUtils.h"
+#include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/common/memoryUtils.h"
 #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h"
 #include "tensorrt_llm/kernels/flashMLA/flash_mla.h"
@@ -1528,12 +1529,44 @@ int AttentionOp::enqueueContext(EnqueueContextParams<T> const& params, cudaStrea
                 == false,
             "Found invalid number (NaN or Inf) in " + beforeRopeStr);
     }
+
+    KVBlockArray mla_context_paged_kv_cache_buffer;
     if (mIsMLAEnabled)
     {
         params.mla_param->cache_type = cache_type;
         params.mla_param->cu_q_seqlens = cu_q_seqlens;
         params.mla_param->quant_scale_kv = params.kv_scale_orig_quant;
-        invokeMLARopeContext<T, KVCacheBuffer>(*params.mla_param, kv_cache_buffer, stream);
+        if (mPagedContextFMHA && mPagedKVCache)
+        {
+            TLLM_CHECK_WITH_INFO(params.mla_param->context_paged_kv_ptr != nullptr,
+                "Paged kv cache is not set for MLA context kernel");
+            TLLM_CHECK_WITH_INFO(params.mla_param->context_kv_cache_block_offsets_ptr != nullptr,
+                "Paged kv cache block offsets is not set for MLA context kernel");
+            // Build another KVBlockArray for the MLA context kernel to read the paged kv cache built by the
+            // PyTorch backend; assume the dtype of the paged kv cache is the same as T.
+            auto const elemSize = sizeof(T);
+            auto const headSize = params.mla_param->meta.qk_nope_head_dim + params.mla_param->meta.qk_rope_head_dim;
+            // mNumKVHeads is 1 for writing; we use mNumHeads for reading the paged kv cache.
+            auto sizePerToken = mNumHeads * headSize * elemSize;
+            auto maxBlocksPerSeq = params.mla_param->context_paged_kv_max_blocks_per_seq;
+            TLLM_LOG_DEBUG(
+                "AttentionOp building KVBlockArray for MLA context kernel, elemSize: %d, headSize: %d, mNumHeads: "
+                "%d, sizePerToken: %d, batchSize: %d, maxBlocksPerSeq: %d, tokensPerBlock: %d, maxAttentionWindow: "
+                "%d, sinkTokenLen: %d, canUseOneMoreBlock: %d",
+                elemSize, headSize, mNumHeads, sizePerToken, params.batch_size, maxBlocksPerSeq, mTokensPerBlock,
+                params.cyclic_attention_window_size, params.sink_token_length, params.can_use_one_more_block);
+            mla_context_paged_kv_cache_buffer = KVBlockArray(params.batch_size, maxBlocksPerSeq, mTokensPerBlock,
+                sizePerToken, params.cyclic_attention_window_size, params.max_cyclic_attention_window_size,
+                params.sink_token_length, params.can_use_one_more_block, params.mla_param->context_paged_kv_ptr,
+                nullptr,
+                static_cast<KVBlockArray::DataType*>(params.mla_param->context_kv_cache_block_offsets_ptr));
+        }
+        else
+        {
+            // Compute RoPE and set compressed_kv + k_pe via invokeMLARopeContext when not using paged context FMHA.
+            invokeMLARopeContext<T, KVCacheBuffer>(*params.mla_param, kv_cache_buffer, stream);
+        }
     }
     else
     {
@@ -1596,7 +1629,15 @@ int AttentionOp::enqueueContext(EnqueueContextParams<T> const& params, cudaStrea
     fmhaParams.packedMaskPtr = params.attention_packed_mask;
     if constexpr (std::is_same_v<KVCacheBuffer, KVBlockArray>)
     {
-        fmhaParams.pagedKvCache = kv_cache_buffer;
+        if (mIsMLAEnabled && mPagedContextFMHA && mPagedKVCache)
+        {
+            fmhaParams.pagedKvCache = mla_context_paged_kv_cache_buffer;
+            fmhaParams.qPtr = reinterpret_cast<void const*>(attention_input);
+        }
+        else
+        {
+            fmhaParams.pagedKvCache = kv_cache_buffer;
+        }
     }
     fmhaParams.cuQSeqLenPtr = cu_q_seqlens;
     fmhaParams.kvSeqLenPtr = decoder_params.seqKVLengths;
@@ -1612,7 +1653,6 @@ int AttentionOp::enqueueContext(EnqueueContextParams<T> const& params, cudaStrea
     // Run the fmha kernel.
     mFmhaDispatcher->run(fmhaParams);
     sync_check_cuda_error(stream);
-
     // The kv cache might need to be updated after FMHA (only when sliding window attention + chunked context is
     // used together). Reuse the preprocessingParams.
     invokeKvCachePostprocessing(preprocessingParams, stream);
@@ -2418,9 +2458,8 @@ int AttentionOp::initialize() noexcept
     }
     else
     {
-        fmhaParams.attentionInputLayout = (mPagedKVCache && mPagedContextFMHA && !mIsMLAEnabled)
-            ? AttentionInputLayout::Q_PAGED_KV
-            : AttentionInputLayout::PACKED_QKV;
+        fmhaParams.attentionInputLayout = (mPagedKVCache && mPagedContextFMHA) ? AttentionInputLayout::Q_PAGED_KV
+                                                                               : AttentionInputLayout::PACKED_QKV;
     }
     fmhaParams.isSPadded = !mRemovePadding;
     fmhaParams.numQHeads = mNumAttnHeads;
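
The sizing logic added to enqueueContext is worth working through once: the temporary paged view read by the context kernel is laid out per token as mNumHeads * (qk_nope_head_dim + qk_rope_head_dim) elements, while the real cache is written with a single KV head. A standalone sketch of that arithmetic under assumed DeepSeek-style dimensions (bf16, 128 query heads); the numbers are illustrative, not asserted by the commit.

    // Size-per-token arithmetic for the MLA context KVBlockArray, per the expressions above.
    #include <cstdio>

    int main()
    {
        int const qkNopeHeadDim = 128; // params.mla_param->meta.qk_nope_head_dim (assumed value)
        int const qkRopeHeadDim = 64;  // params.mla_param->meta.qk_rope_head_dim (assumed value)
        int const numHeads = 128;      // mNumHeads: query heads, used for *reading* the paged view
        int const elemSize = 2;        // sizeof(T) for bf16/fp16

        int const headSize = qkNopeHeadDim + qkRopeHeadDim;     // 192
        int const sizePerToken = numHeads * headSize * elemSize; // 128 * 192 * 2 = 49152 bytes
        std::printf("headSize = %d, sizePerToken = %d bytes\n", headSize, sizePerToken);
        return 0;
    }

With Q_PAGED_KV now selected for MLA as well (the initialize() hunk above), the fused context kernel consumes qPtr for the queries and this paged view for K/V instead of the packed-QKV path.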

cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h

Lines changed: 6 additions & 6 deletions
@@ -379,9 +379,9 @@ extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_so
 extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_softcapping_sm90_cu_cubin[];
 extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90_cu_cubin[];
 extern unsigned char cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_cu_cubin[];
-extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_sm90_cu_cubin[];
-extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_sm90_cu_cubin[];
+extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_tma_ws_sm90_cu_cubin[];
 extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90_cu_cubin[];
+extern unsigned char cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_tma_ws_sm90_cu_cubin[];
 #endif

 #ifndef EXCLUDE_SM_89
@@ -1661,9 +1661,9 @@ extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcap
 extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_256_softcapping_sm90_cu_cubin_len;
 extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90_cu_cubin_len;
 extern uint32_t cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_cu_cubin_len;
-extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_sm90_cu_cubin_len;
-extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_sm90_cu_cubin_len;
+extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_tma_ws_sm90_cu_cubin_len;
 extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90_cu_cubin_len;
+extern uint32_t cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_tma_ws_sm90_cu_cubin_len;
 #endif

 #ifndef EXCLUDE_SM_89
@@ -3573,9 +3573,9 @@ static const struct FusedMultiHeadAttentionKernelMetaInfoV2
 { DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 32, 128, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sliding_window_causal_softcapping_sm90_kernel_nl", 32768, 128, 64, 2, 0, false, true, false, true, true, false, true, false},
 { DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_causal_softcapping_sm90_kernel_nl", 49152, 128, 64, 1, 0, false, true, false, true, true, false, true, false},
 { DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 16, 256, 256, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_cu_cubin, cubin_fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_softcapping_sm90_cu_cubin_len, "fmha_v2_flash_attention_fp16_fp32_64_16_S_qkv_256_sliding_window_causal_softcapping_sm90_kernel_nl", 49152, 128, 64, 2, 0, false, true, false, true, true, false, true, false},
-{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 0, false, true, false, true, true, true, false, false},
-{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_causal_sm90_kernel_nl_tiled", 81920, 128, 64, 1, 2, false, true, false, true, true, true, false, false},
+{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_qkv_192x128_causal_tma_ws_sm90_kernel", 213248, 384, 64, 1, 0, false, true, true, true, false, false, false, false},
 { DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 64, 576, 512, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_64_S_q_paged_kv_576x512_sm90_kernel_nl_tiled", 49152, 128, 64, 0, 2, false, true, false, true, true, true, false, false},
+{ DATA_TYPE_BF16, DATA_TYPE_BF16, 0, 64, 128, 192, 128, 0, 0, 0, kSM_90, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_tma_ws_sm90_cu_cubin, cubin_fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_tma_ws_sm90_cu_cubin_len, "fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_192x128_causal_tma_ws_sm90_kernel", 213248, 384, 64, 1, 2, false, true, true, true, false, false, false, false},
 #endif

 #ifndef EXCLUDE_SM_89
