[https://nvbugs/5983390][perf] Reduce host overhead in DSA MLA attention path

liji-nv · liji-nv · commit 44cdcabbe129 · 2026-03-31T08:05:54.000-07:00
Pass pre-computed num_contexts/num_ctx_tokens to thop::attention and
trtllm_gen_attention to eliminate per-layer sum().item() calls that
recompute batch structure from host_request_types/host_context_lengths.

Move view/slice/reinterpret ops from Python _update_k_cache into the
C++ indexer_k_cache_scatter_op kernel: accept original k_fp8 (FP8) and
k_scale (float32) tensors directly with num_tokens, avoiding per-layer
torch.empty, view, as_strided and slice overhead on the host.

Signed-off-by: Jin Li <59594262+liji-nv@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/nanobind/thop/bindings.cpp b/cpp/tensorrt_llm/nanobind/thop/bindings.cpp
@@ -70,7 +70,8 @@ void initBindings(nb::module_& m)
         nb::arg("cu_kv_seqlens") = std::nullopt, nb::arg("fmha_scheduler_counter") = std::nullopt,
         nb::arg("mla_bmm1_scale") = std::nullopt, nb::arg("mla_bmm2_scale") = std::nullopt,
         nb::arg("quant_q_buffer") = std::nullopt, nb::arg("flash_mla_tile_scheduler_metadata") = std::nullopt,
-        nb::arg("flash_mla_num_splits") = std::nullopt, "Multi-head attention operation",
+        nb::arg("flash_mla_num_splits") = std::nullopt, nb::arg("opt_num_contexts") = std::nullopt,
+        nb::arg("opt_num_ctx_tokens") = std::nullopt, "Multi-head attention operation",
         nb::call_guard<nb::gil_scoped_release>());
 
     m.def(
diff --git a/cpp/tensorrt_llm/thop/IndexerKCacheScatterOp.cpp b/cpp/tensorrt_llm/thop/IndexerKCacheScatterOp.cpp
@@ -28,69 +28,56 @@ TRTLLM_NAMESPACE_BEGIN
 namespace torch_ext
 {
 
-void indexer_k_cache_scatter_op(th::Tensor const& k_fp8_bytes, th::Tensor const& k_scale_bytes, th::Tensor& k_cache,
-    th::Tensor const& slot_mapping_fp8, th::Tensor const& slot_mapping_scale)
+void indexer_k_cache_scatter_op(th::Tensor const& k_fp8, th::Tensor const& k_scale, th::Tensor& k_cache,
+    th::Tensor const& slot_mapping_fp8, th::Tensor const& slot_mapping_scale, int64_t num_tokens)
 {
-    // Validate all tensors are CUDA tensors
-    TORCH_CHECK(k_fp8_bytes.is_cuda() && k_scale_bytes.is_cuda() && k_cache.is_cuda() && slot_mapping_fp8.is_cuda()
+    // k_fp8: [>=num_tokens, head_dim] in FP8 (1 byte/element) — reinterpreted as uint8
+    // k_scale: [>=num_tokens, head_dim // quant_block_size] in float32 — reinterpreted as uint8 bytes
+    // slot_mapping_fp8, slot_mapping_scale: [>=num_tokens] int64 — only first num_tokens used
+    // k_cache: [num_blocks, block_size, 1, per_token_size] uint8
+
+    TORCH_CHECK(k_fp8.is_cuda() && k_scale.is_cuda() && k_cache.is_cuda() && slot_mapping_fp8.is_cuda()
             && slot_mapping_scale.is_cuda(),
         "All tensors must be CUDA tensors");
 
-    // Validate tensor dimensions
-    TORCH_CHECK(k_fp8_bytes.dim() == 2, "k_fp8_bytes must be a 2D Tensor [num_tokens, head_dim]");
-    TORCH_CHECK(k_scale_bytes.dim() == 2, "k_scale_bytes must be a 2D Tensor [num_tokens, scale_size]");
-    TORCH_CHECK(slot_mapping_fp8.dim() == 1, "slot_mapping_fp8 must be a 1D Tensor [num_tokens]");
-    TORCH_CHECK(slot_mapping_scale.dim() == 1, "slot_mapping_scale must be a 1D Tensor [num_tokens]");
-
-    // Enforce k_cache is 4D tensor
-    TORCH_CHECK(k_cache.dim() == 4,
-        "k_cache must be a 4D Tensor [num_blocks, block_size, 1, per_token_size], got %d dimensions",
+    TORCH_CHECK(k_fp8.dim() == 2, "k_fp8 must be 2D [num_tokens, head_dim]");
+    TORCH_CHECK(k_scale.dim() == 2, "k_scale must be 2D [num_tokens, scale_elements]");
+    TORCH_CHECK(k_cache.dim() == 4, "k_cache must be 4D [num_blocks, block_size, 1, per_token_size], got %d dims",
         static_cast<int>(k_cache.dim()));
 
-    // Validate tensor dtypes
-    TORCH_CHECK(k_fp8_bytes.scalar_type() == torch::kUInt8, "k_fp8_bytes must be uint8");
-    TORCH_CHECK(k_scale_bytes.scalar_type() == torch::kUInt8, "k_scale_bytes must be uint8");
-    TORCH_CHECK(slot_mapping_fp8.scalar_type() == torch::kInt64, "slot_mapping_fp8 must be int64");
-    TORCH_CHECK(slot_mapping_scale.scalar_type() == torch::kInt64, "slot_mapping_scale must be int64");
-
-    // Validate tensor shapes are consistent
-    auto num_tokens = static_cast<int32_t>(k_fp8_bytes.size(0));
-    TORCH_CHECK(
-        k_scale_bytes.size(0) == num_tokens, "k_scale_bytes first dimension must equal k_fp8_bytes first dimension");
-    TORCH_CHECK(slot_mapping_fp8.size(0) == num_tokens, "slot_mapping_fp8 length must equal num_tokens");
-    TORCH_CHECK(slot_mapping_scale.size(0) == num_tokens, "slot_mapping_scale length must equal num_tokens");
-
-    // Validate tensors are contiguous (except k_cache which may be non-contiguous)
-    TORCH_CHECK(k_fp8_bytes.is_contiguous(), "k_fp8_bytes must be contiguous");
-    TORCH_CHECK(k_scale_bytes.is_contiguous(), "k_scale_bytes must be contiguous");
-    // k_cache can be non-contiguous - we handle this via strides
+    TORCH_CHECK(k_fp8.is_contiguous(), "k_fp8 must be contiguous");
+    TORCH_CHECK(k_scale.is_contiguous(), "k_scale must be contiguous");
     TORCH_CHECK(slot_mapping_fp8.is_contiguous(), "slot_mapping_fp8 must be contiguous");
     TORCH_CHECK(slot_mapping_scale.is_contiguous(), "slot_mapping_scale must be contiguous");
 
-    int32_t head_dim = static_cast<int32_t>(k_fp8_bytes.size(1));     // head_dim = quant_block_size = 128
-    int32_t scale_size = static_cast<int32_t>(k_scale_bytes.size(1)); // scale_size = 4 bytes
-
-    int32_t cache_dim_0 = static_cast<int32_t>(k_cache.size(0));      // num_blocks
-    int32_t cache_dim_1 = static_cast<int32_t>(k_cache.size(1));      // block_size
-    int32_t cache_dim_2 = static_cast<int32_t>(k_cache.size(2));      // num_kv_heads
-    int32_t cache_dim_3 = static_cast<int32_t>(k_cache.size(3));      // per_token_size
-
-    // Validation for indexer k cache pool for DeepSeek-V3.2 constraints
-    TORCH_CHECK(cache_dim_2 == 1, "k_cache dimension 2 must be 1 for DeepSeek-V3.2, got %d", cache_dim_2);
-    TORCH_CHECK(head_dim == 128, "k_fp8_bytes head_dim must be 128 for DeepSeek-V3.2, got %d", head_dim);
-    TORCH_CHECK(scale_size == 4, "k_scale_bytes scale_size must be 4 bytes for DeepSeek-V3.2, got %d", scale_size);
-
-    int64_t cache_stride_0 = static_cast<int64_t>(k_cache.stride(0));
-    int64_t cache_stride_1 = static_cast<int64_t>(k_cache.stride(1));
-    int64_t cache_stride_2 = static_cast<int64_t>(k_cache.stride(2));
-    int64_t cache_stride_3 = static_cast<int64_t>(k_cache.stride(3));
-
-    auto stream = at::cuda::getCurrentCUDAStream(k_fp8_bytes.get_device());
-
-    tk::invokeIndexerKCacheScatter(k_fp8_bytes.data_ptr<uint8_t>(), k_scale_bytes.data_ptr<uint8_t>(),
-        k_cache.data_ptr<uint8_t>(), slot_mapping_fp8.data_ptr<int64_t>(), slot_mapping_scale.data_ptr<int64_t>(),
-        num_tokens, head_dim, scale_size, cache_dim_0, cache_dim_1, cache_dim_2, cache_dim_3, cache_stride_0,
-        cache_stride_1, cache_stride_2, cache_stride_3, stream);
+    // FP8 is 1 byte per element, so head_dim in elements == head_dim in bytes.
+    int32_t const head_dim = static_cast<int32_t>(k_fp8.size(1));
+    // float32 scale: each element is 4 bytes.
+    int32_t const scale_size = static_cast<int32_t>(k_scale.size(1)) * 4;
+
+    int32_t const cache_dim_0 = static_cast<int32_t>(k_cache.size(0));
+    int32_t const cache_dim_1 = static_cast<int32_t>(k_cache.size(1));
+    int32_t const cache_dim_2 = static_cast<int32_t>(k_cache.size(2));
+    int32_t const cache_dim_3 = static_cast<int32_t>(k_cache.size(3));
+
+    TORCH_CHECK(cache_dim_2 == 1, "k_cache dimension 2 must be 1, got %d", cache_dim_2);
+    TORCH_CHECK(head_dim == 128, "k_fp8 head_dim must be 128, got %d", head_dim);
+    TORCH_CHECK(scale_size == 4, "k_scale scale_size must be 4 bytes, got %d", scale_size);
+
+    int64_t const cache_stride_0 = static_cast<int64_t>(k_cache.stride(0));
+    int64_t const cache_stride_1 = static_cast<int64_t>(k_cache.stride(1));
+    int64_t const cache_stride_2 = static_cast<int64_t>(k_cache.stride(2));
+    int64_t const cache_stride_3 = static_cast<int64_t>(k_cache.stride(3));
+
+    auto stream = at::cuda::getCurrentCUDAStream(k_fp8.get_device());
+
+    // Reinterpret k_fp8 as uint8 bytes and k_scale as raw bytes via data_ptr.
+    // For slot mappings, use data_ptr directly — only the first num_tokens entries are read.
+    tk::invokeIndexerKCacheScatter(reinterpret_cast<uint8_t const*>(k_fp8.data_ptr()),
+        reinterpret_cast<uint8_t const*>(k_scale.data_ptr()), k_cache.data_ptr<uint8_t>(),
+        slot_mapping_fp8.data_ptr<int64_t>(), slot_mapping_scale.data_ptr<int64_t>(), static_cast<int32_t>(num_tokens),
+        head_dim, scale_size, cache_dim_0, cache_dim_1, cache_dim_2, cache_dim_3, cache_stride_0, cache_stride_1,
+        cache_stride_2, cache_stride_3, stream);
 }
 
 } // namespace torch_ext
@@ -100,8 +87,8 @@ TRTLLM_NAMESPACE_END
 TORCH_LIBRARY_FRAGMENT(trtllm, m)
 {
     m.def(
-        "indexer_k_cache_scatter_op(Tensor k_fp8_bytes, Tensor k_scale_bytes, Tensor(a!) k_cache, "
-        "Tensor slot_mapping_fp8, Tensor slot_mapping_scale) -> ()");
+        "indexer_k_cache_scatter_op(Tensor k_fp8, Tensor k_scale, Tensor(a!) k_cache, "
+        "Tensor slot_mapping_fp8, Tensor slot_mapping_scale, int num_tokens) -> ()");
 }
 
 TORCH_LIBRARY_IMPL(trtllm, CUDA, m)
diff --git a/cpp/tensorrt_llm/thop/attentionOp.cpp b/cpp/tensorrt_llm/thop/attentionOp.cpp
@@ -630,7 +630,8 @@ void attention(torch::Tensor q, std::optional<torch::Tensor> k, std::optional<to
     std::optional<torch::Tensor> cu_q_seqlens, std::optional<torch::Tensor> cu_kv_seqlens,
     std::optional<torch::Tensor> fmha_scheduler_counter, std::optional<torch::Tensor> mla_bmm1_scale,
     std::optional<torch::Tensor> mla_bmm2_scale, std::optional<torch::Tensor> quant_q_buffer,
-    std::optional<torch::Tensor> flash_mla_tile_scheduler_metadata, std::optional<torch::Tensor> flash_mla_num_splits)
+    std::optional<torch::Tensor> flash_mla_tile_scheduler_metadata, std::optional<torch::Tensor> flash_mla_num_splits,
+    std::optional<int64_t> opt_num_contexts, std::optional<int64_t> opt_num_ctx_tokens)
 {
     TLLM_LOG_TRACE("Attention op starts at layer %d", layer_idx);
     // Use these tensors to infer if the attention is using KV cache
@@ -833,19 +834,28 @@ void attention(torch::Tensor q, std::optional<torch::Tensor> k, std::optional<to
     }
     bool const is_gen_only = attn_input_type == AttentionInputType::GenerationOnly;
 
-    int32_t num_contexts = 0;
-    // count context requests
-    for (int32_t idx = 0; idx < num_seqs; idx++)
+    int32_t num_contexts;
+    if (opt_num_contexts.has_value())
     {
-        if (request_types[idx] != RequestType::kCONTEXT)
+        num_contexts = static_cast<int32_t>(opt_num_contexts.value());
+    }
+    else
+    {
+        num_contexts = 0;
+        for (int32_t idx = 0; idx < num_seqs; idx++)
         {
-            break;
+            if (request_types[idx] != RequestType::kCONTEXT)
+            {
+                break;
+            }
+            ++num_contexts;
         }
-        ++num_contexts;
     }
     int32_t const num_generations = num_seqs - num_contexts;
     int32_t const num_tokens = qkv_or_q.size(0);
-    int32_t const num_ctx_tokens = host_context_lengths.slice(0, 0, num_contexts).sum().item<int32_t>();
+    int32_t const num_ctx_tokens = opt_num_ctx_tokens.has_value()
+        ? static_cast<int32_t>(opt_num_ctx_tokens.value())
+        : host_context_lengths.slice(0, 0, num_contexts).sum().item<int32_t>();
     int32_t const num_gen_tokens = is_gen_only ? num_tokens : num_tokens - num_ctx_tokens;
     auto const ctx_total_kv_len = host_total_kv_lens.index({0}).item<int32_t>();
     auto const gen_total_kv_len = host_total_kv_lens.index({1}).item<int32_t>();
diff --git a/cpp/tensorrt_llm/thop/attentionOp.h b/cpp/tensorrt_llm/thop/attentionOp.h
@@ -78,7 +78,8 @@ void attention(torch::Tensor q, std::optional<torch::Tensor> k, std::optional<to
     std::optional<torch::Tensor> fmha_scheduler_counter, std::optional<torch::Tensor> mla_bmm1_scale,
     std::optional<torch::Tensor> mla_bmm2_scale, std::optional<torch::Tensor> quant_q_buffer,
     std::optional<torch::Tensor> flash_mla_tile_scheduler_metadata = std::nullopt,
-    std::optional<torch::Tensor> flash_mla_num_splits = std::nullopt);
+    std::optional<torch::Tensor> flash_mla_num_splits = std::nullopt,
+    std::optional<int64_t> opt_num_contexts = std::nullopt, std::optional<int64_t> opt_num_ctx_tokens = std::nullopt);
 
 struct KvCachePoolPointers
 {
diff --git a/tensorrt_llm/_torch/attention_backend/sparse/dsa.py b/tensorrt_llm/_torch/attention_backend/sparse/dsa.py
@@ -1356,34 +1356,18 @@ def _update_k_cache(self, k_fp8: torch.Tensor, k_scale: torch.Tensor,
         if metadata.kv_cache_manager is None or metadata.slot_mapping_fp8 is None:
             return
 
-        # [num_blocks, block_size, 1, per_token_size ]
         k_cache = metadata.kv_cache_manager.get_indexer_k_cache_buffers(
             self.layer_idx)
 
         num_tokens = k_fp8.shape[0]
-        head_dim = k_fp8.shape[1]
-        scale_size = k_scale.shape[1] * 4  # Convert to bytes (float32 = 4 bytes)
-
-        # Convert to bytes: flatten first, then view as uint8, then reshape
-        k_fp8_bytes = k_fp8.view(-1).view(torch.uint8).view(
-            num_tokens, head_dim)
-
-        # k_scale: for single-element tensors, contiguous() may be no-op
-        # Fix stride(-1) for byte-level view
-        k_scale_flat = k_scale.view(-1)
-        if k_scale_flat.stride(-1) != 1:
-            k_scale_flat = torch.as_strided(k_scale_flat.contiguous(),
-                                            size=(k_scale_flat.numel(), ),
-                                            stride=(1, ))
-        k_scale_bytes = k_scale_flat.view(torch.uint8).view(
-            num_tokens, scale_size)
-
-        # Use CUDA kernel to scatter FP8 and scale bytes into cache
-        flat_indices_fp8 = metadata.slot_mapping_fp8[:num_tokens]
-        flat_indices_scale = metadata.slot_mapping_scale[:num_tokens]
-        torch.ops.trtllm.indexer_k_cache_scatter_op(k_fp8_bytes, k_scale_bytes,
-                                                    k_cache, flat_indices_fp8,
-                                                    flat_indices_scale)
+
+        # The C++ op reinterprets k_fp8 (FP8) and k_scale (float32) as raw
+        # bytes internally and only reads the first num_tokens entries from
+        # the slot mapping buffers, avoiding Python-side view/slice overhead.
+        torch.ops.trtllm.indexer_k_cache_scatter_op(k_fp8, k_scale, k_cache,
+                                                    metadata.slot_mapping_fp8,
+                                                    metadata.slot_mapping_scale,
+                                                    num_tokens)
 
     def sparse_attn_indexer(
         self,
diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py
@@ -407,6 +407,9 @@ def run(
         mla_bmm1_scale: Optional[torch.Tensor] = None,
         mla_bmm2_scale: Optional[torch.Tensor] = None,
         quant_q_buffer: Optional[torch.Tensor] = None,
+        num_contexts: int = 0,
+        num_generations: int = 0,
+        num_ctx_tokens: int = 0,
     ):
         """
         Run the attention operation.
@@ -639,6 +642,9 @@ def run(
                 self.quant_config,
                 self.kv_cache_manager,
                 global_layer_idx=self.global_layer_idx,
+                num_contexts=num_contexts,
+                num_generations=num_generations,
+                num_ctx_tokens=num_ctx_tokens,
             )
         else:
             thop.attention(
@@ -722,6 +728,8 @@ def run(
                 quant_q_buffer,
                 self.flash_mla_tile_scheduler_metadata,
                 self.flash_mla_num_splits,
+                num_contexts,
+                num_ctx_tokens,
             )
 
         if self.print_skip_softmax_stat:
@@ -2049,7 +2057,10 @@ def forward(
                          fmha_scheduler_counter=fmha_scheduler_counter,
                          mla_bmm1_scale=mla_bmm1_scale,
                          mla_bmm2_scale=mla_bmm2_scale,
-                         quant_q_buffer=quant_q_buffer)
+                         quant_q_buffer=quant_q_buffer,
+                         num_contexts=metadata.num_contexts,
+                         num_generations=metadata.num_generations,
+                         num_ctx_tokens=metadata.num_ctx_tokens)
 
         if output_sf is None:
             return output
diff --git a/tensorrt_llm/_torch/attention_backend/trtllm_gen.py b/tensorrt_llm/_torch/attention_backend/trtllm_gen.py
@@ -1562,6 +1562,9 @@ def trtllm_gen_attention(
     quant_config: Optional[QuantConfig],
     kv_cache_manager: Optional[KVCacheManager],
     global_layer_idx: Optional[int] = None,
+    num_contexts: int = 0,
+    num_generations: int = 0,
+    num_ctx_tokens: int = 0,
 ) -> None:
     """
     TrtLLM-Gen attention using flashinfer backend.
@@ -1691,9 +1694,6 @@ def trtllm_gen_attention(
     if attention_input_type is not None:
         attn_input_type = AttentionInputType(attention_input_type)
 
-    num_contexts, num_generations = _parse_request_types(host_request_types)
-
-    num_ctx_tokens = int(host_context_lengths[:num_contexts].sum()) if num_contexts > 0 else 0
     num_gen_tokens = num_tokens - num_ctx_tokens
 
     # Prepare Workspace
diff --git a/tests/unittest/_torch/attention/sparse/test_dsa_indexer.py b/tests/unittest/_torch/attention/sparse/test_dsa_indexer.py
@@ -703,7 +703,7 @@ def test_indexer_k_cache_scatter_custom_op():
                              dtype=torch.bfloat16)
     k_fp8, k_scale = fp8_utils.fp8_quantize_1x128_sf_transpose(k_original)
 
-    # Prepare byte-level data
+    # Prepare byte-level data for the Python reference path
     scale_size = k_scale.shape[1] * 4
     k_fp8_bytes = k_fp8.view(-1).view(torch.uint8).view(num_tokens, head_dim)
     k_scale_flat = k_scale.view(-1)
@@ -742,9 +742,10 @@ def test_indexer_k_cache_scatter_custom_op():
 
     # ========== Path 1: CUDA Kernel ==========
     print(f"\n=== Path 1: CUDA Kernel ===")
-    torch.ops.trtllm.indexer_k_cache_scatter_op(k_fp8_bytes, k_scale_bytes,
-                                                k_cache_cuda, flat_indices_fp8,
-                                                flat_indices_scale)
+    torch.ops.trtllm.indexer_k_cache_scatter_op(k_fp8, k_scale, k_cache_cuda,
+                                                metadata.slot_mapping_fp8,
+                                                metadata.slot_mapping_scale,
+                                                num_tokens)
     torch.cuda.synchronize()
     print(f"✓ CUDA kernel completed")
 

Original file line number	Diff line number	Diff line change
`@@ -78,7 +78,8 @@ void attention(torch::Tensor q, std::optional<torch::Tensor> k, std::optional<to`
`78`	`78`	`std::optional<torch::Tensor> fmha_scheduler_counter, std::optional<torch::Tensor> mla_bmm1_scale,`
`79`	`79`	`std::optional<torch::Tensor> mla_bmm2_scale, std::optional<torch::Tensor> quant_q_buffer,`
`80`	`80`	`std::optional<torch::Tensor> flash_mla_tile_scheduler_metadata = std::nullopt,`
`81`		`- std::optional<torch::Tensor> flash_mla_num_splits = std::nullopt);`
	`81`	`+ std::optional<torch::Tensor> flash_mla_num_splits = std::nullopt,`
	`82`	`+ std::optional<int64_t> opt_num_contexts = std::nullopt, std::optional<int64_t> opt_num_ctx_tokens = std::nullopt);`
`82`	`83`
`83`	`84`	`struct KvCachePoolPointers`
`84`	`85`	`{`