@@ -1183,6 +1183,12 @@ def sample_async(
             model_outputs: dict[str, torch.Tensor],
             num_context_logits_prefix_sum: list[int],
             resource_manager: Optional[ResourceManager] = None) -> SampleState:
+        # NB: The sampler is either called directly by PyExecutor, for the target model,
+        # or by ModelDrafter.prepare_draft_tokens(), for the draft model. In the former
+        # case there are 1 + get_draft_token_length(request) tokens per request. In the
+        # latter case, there is always only 1 token per request because draft
+        # tokens are sampled one-by-one.
+
         requests = scheduled_requests.all_requests()
         new_tokens = self.store.new_tokens
         log_probs_host = self.log_probs_host(scheduled_requests)
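The comment added above distinguishes the two calling contexts and how many tokens each one samples per request. Below is a minimal sketch of that bookkeeping, not code from this diff: `called_from_drafter` is a hypothetical flag, and the draft-token count is passed in directly rather than read via `get_draft_token_length(request)` as in the comment.

def expected_tokens_per_request(draft_token_length: int,
                                called_from_drafter: bool) -> int:
    """Sketch of the per-request token count described in the comment above."""
    if called_from_drafter:
        # Draft-model path: ModelDrafter.prepare_draft_tokens() samples tokens one-by-one.
        return 1
    # Target-model path (PyExecutor): one newly generated token plus one token per
    # draft position that speculative decoding has to verify.
    return 1 + draft_token_length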
@@ -1332,8 +1338,6 @@ def _sample_batched_by_strategy(
             requests, pin_memory=True)
         generator_cuda = self.get_generator(cuda_device)
 
-        # FIXME: This check should/could be performed in ModelDrafter.prepare_draft_tokens
-        #
         # NB: Currently, "d2t" is applied to draft tokens, but not to draft logits,
         # breaking _process_draft_tokens_rejection_sampling.
         needs_d2t = "d2t" in model_outputs
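The NB retained above notes that "d2t" is applied to the sampled draft tokens but not to the draft logits. The following self-contained toy example illustrates why that mismatch breaks rejection sampling; the probabilities and the offset table are made up for illustration and are not taken from the repository.

import torch

# Rejection sampling accepts a draft token t with probability
# min(1, p_target(t) / p_draft(t)), so both lookups must use a consistent vocabulary.
draft_probs = torch.tensor([0.1, 0.2, 0.3, 0.4])               # indexed by draft-vocab IDs
target_probs = torch.tensor([0.05, 0.1, 0.15, 0.2, 0.2, 0.3])  # indexed by target-vocab IDs
d2t = torch.tensor([0, 1, 1, 2])                               # target_id = draft_id + d2t[draft_id]

draft_id = torch.tensor(2)
target_id = draft_id + d2t[draft_id]                           # -> 3 in the target vocab

# Consistent: target-vocab ID into target_probs, draft-vocab ID into draft_probs.
accept = torch.minimum(torch.tensor(1.0),
                       target_probs[target_id] / draft_probs[draft_id])

# If the token has already been remapped when the draft distribution is consulted,
# the draft-side lookup silently reads the probability of a different draft token
# (or goes out of range), which is the breakage the comment refers to.
wrong = torch.minimum(torch.tensor(1.0),
                      target_probs[target_id] / draft_probs[target_id])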
@@ -1459,15 +1463,16 @@ def _sample_batched_by_strategy(
         (batch_req_indices, batch_next_tokens_cuda_int,
          batch_softmax_cuda), = batched_results
 
-        # FIXME: This should be done in ModelDrafter.prepare_draft_tokens, but for performance
-        # parity py_draft_tokens might need to be replaced / backed by a torch.Tensor, so
-        # that d2t can be applied in a batched manner similar to the code below.
+        # NB: 'd2t' contains offsets for transforming draft vocab token IDs into
+        # the target vocab. This is used by Eagle3ForCausalLM, whose input domain
+        # is the target vocab, whereas the output logits correspond to the draft
+        # vocab. Since the inputs/outputs are linked by TorchSampler.update_requests,
+        # they currently need to be handled within TorchSampler. Changing the model
+        # outputs to use the target vocab would require inflating the logit tensors,
+        # which is inefficient. Changing the inputs to use the draft vocab might
+        # be cleaner, but would require applying 'd2t' in multiple locations:
+        # Prefill, Eagle3ForCausalLM embeddings, ModelDrafter.
         if needs_d2t:
-            # NB: The sampler is either called directly by PyExecutor, for the target model,
-            # or by ModelDrafter.prepare_draft_tokens(), for the draft model. In the former
-            # case there are 1 + get_draft_token_length(request) tokens per request. In the
-            # latter case, only there is always only 1 token per request because draft
-            # tokens are sampled one-by-one.
             self._apply_d2t(batch_next_tokens_cuda_int, model_outputs)
 
         return _BatchedSamplingResult(
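For reference, here is a minimal sketch of what a batched "d2t" application can look like, given the description in the new comment (an offset table indexed by draft-vocab token ID). The real _apply_d2t in this file may differ in details such as dtype handling and in-place semantics.

import torch

def apply_d2t_sketch(tokens: torch.Tensor, model_outputs: dict) -> None:
    """Remap draft-vocab token IDs to target-vocab IDs in place, assuming
    model_outputs["d2t"] is a 1-D offset table with target_id = draft_id + d2t[draft_id]."""
    d2t = model_outputs["d2t"]
    # Batched gather of the per-token offsets, added in place to the sampled tokens.
    tokens += d2t[tokens]

# Usage with the toy offset table from the earlier example:
tokens = torch.tensor([0, 2, 3])
apply_d2t_sketch(tokens, {"d2t": torch.tensor([0, 1, 1, 2])})
# tokens is now tensor([0, 3, 5]), i.e. expressed in the target vocabulary.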
@@ -1909,7 +1914,6 @@ def sample_async(
             num_context_logits_prefix_sum: list[int],
             resource_manager: Optional[ResourceManager] = None
     ) -> SampleStateTRTLLM:
-
         batch_size = scheduled_requests.batch_size
         beam_width = self.beam_width(scheduled_requests.all_requests())
         if (batch_size > 1 and beam_width > 1