[Graph Optimization] Refactor default capture list (#4617)

gongshaotian · web-flow · commit fff5fb5e3924 · 2025-10-28T21:31:02.000+08:00
* fix bug and refine code

* add debug count

* refine code
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
@@ -862,7 +862,7 @@ def init_with_cudagrpah_size(self, max_capture_size: int = 0) -> None:
                     self.real_shape_to_captured_size[bs] = end
         self.real_shape_to_captured_size[self.max_capture_size] = self.max_capture_size
 
-    def _set_cudagraph_sizes(self, max_num_seqs: int = 0):
+    def _set_cudagraph_sizes(self, max_capture_size: int = 0):
         """
         Calculate a series of candidate capture sizes,
         and then extract a portion of them as the capture list for the CUDA graph based on user input.
@@ -874,7 +874,7 @@ def _set_cudagraph_sizes(self, max_num_seqs: int = 0):
         # Shape [256, 288, ... 992, 1024]
         draft_capture_sizes += [32 * i for i in range(9, 33)]
 
-        draft_capture_sizes.append(max_num_seqs)
+        draft_capture_sizes.append(max_capture_size)
         self.cudagraph_capture_sizes = sorted(draft_capture_sizes)
 
     def to_json_string(self):
@@ -1391,19 +1391,22 @@ def __init__(
         self.cache_config: CacheConfig = cache_config  # type: ignore
         self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config
         self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config
-        # Initialize cuda graph capture list
-        if self.graph_opt_config.cudagraph_capture_sizes is None:
-            self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.scheduler_config.max_num_seqs)
 
+        # Initialize cuda graph capture list
+        max_capture_shape = self.scheduler_config.max_num_seqs
+        if self.speculative_config is not None and self.speculative_config.method == "mtp":
+            max_capture_shape = self.scheduler_config.max_num_seqs * (
+                self.speculative_config.num_speculative_tokens + 1
+            )
+            assert max_capture_shape % 2 == 0, "CUDAGraph only supports capturing even token nums in MTP scenarios."
         if self.graph_opt_config.cudagraph_only_prefill:
-            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=512)
-        elif self.speculative_config is not None and self.speculative_config.method == "mtp":
-            max_shape = self.scheduler_config.max_num_seqs * (self.speculative_config.num_speculative_tokens + 1)
-            if max_shape % 2 == 1:
-                max_shape = max_shape + 1
-            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=min(512, max_shape))
+            max_capture_shape = 512
         else:
-            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.scheduler_config.max_num_seqs)
+            max_capture_shape = min(512, max_capture_shape)
+
+        if self.graph_opt_config.cudagraph_capture_sizes is None:
+            self.graph_opt_config._set_cudagraph_sizes(max_capture_size=max_capture_shape)
+        self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=max_capture_shape)
 
         self.tokenizer = tokenizer
         self.ips = ips
diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
@@ -171,7 +171,7 @@ def __call__(self, **kwargs) -> List[paddle.Tensor] | paddle.Tensor:
             for n in range(entry.num_finished_warmup, self.warm_up_size):
                 entry.num_finished_warmup += 1
                 entry.runnable(**kwargs)
-                logger.debug(
+                logger.info(
                     f"[CUDA GRAPH][ID:{id(self)}] Warm up for real shape {padding_real_shape}, "
                     f"finished ({n + 1}/{entry.num_finished_warmup}) times"
                 )
@@ -207,7 +207,7 @@ def __call__(self, **kwargs) -> List[paddle.Tensor] | paddle.Tensor:
 
             # For CUDAGraph debug
             # self._save_cudagrpah_dot_files(entry)
-            logger.debug(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}")
+            logger.info(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}")
 
         # Replay
         entry.cuda_graph.replay()
@@ -224,7 +224,7 @@ def _create_entry_dict(self):
         for shape in self.cudagraph_capture_sizes:
             self.concrete_size_entries[shape] = ConcreteSizeEntry(real_shape=shape)
 
-        logger.debug(
+        logger.info(
             f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph capture list {self.cudagraph_capture_sizes}, "
             "Created all real shape entry."
         )
@@ -254,3 +254,9 @@ def _save_cudagrpah_dot_files(self, entry):
                 f"{log_dir}/GraphDotFiles/backend{id(self)}_shape{entry.real_shape}",
                 1 << 0,
             )
+
+    def check_capture_successful(self):
+        """Raise ValueError on the first concrete_size_entries shape whose entry is not captured."""
+        for shape, entry in self.concrete_size_entries.items():
+            if not entry.captured:
+                raise ValueError(f"[CUDA GRAPH][ID:{id(self)}] Shape {shape} capture failed.")
diff --git a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py
@@ -34,6 +34,10 @@
 from fastdeploy.model_executor.graph_optimization.utils import (
     in_sot_warmup_mode as in_warmup_mode,
 )
+from fastdeploy.utils import get_logger
+
+logger = get_logger("cudagrpah_piecewise_backend", "cudagraph_piecewise_backend.log")
+
 
 P = ParamSpec("P")
 T = TypeVar("T")
@@ -105,6 +109,9 @@ def __init__(self, runnable: Callable, fd_config: FDConfig):
         self.dy_runnable = self.runnable
         self.fd_config = fd_config
         self.max_captre_size = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
+        self._debug_count_cudagraph_replay = 0
+        self._debug_count_total_step = 0
+
         if self.fd_config.graph_opt_config.graph_opt_level > 0:
             # 1. Prepare cuda graph input buffers (contain output of subgraphs)
 
@@ -123,6 +130,7 @@ def __init__(self, runnable: Callable, fd_config: FDConfig):
         )
 
     def __call__(self, **kwargs):
+        self._debug_count_total_step += 1
         if not self.fd_config.graph_opt_config.use_cudagraph:
             return self.runnable(**kwargs)
         if self.cudagraph_piecewise_backend is None:
@@ -136,6 +144,10 @@ def __call__(self, **kwargs):
         if (not kwargs["forward_meta"].step_use_cudagraph) or (real_shape > self.cudagraph_switch_threshold):
             return self.dy_runnable(**kwargs)
         else:
+            self._debug_count_cudagraph_replay += 1
+            logger.debug(
+                f"[CUDA GRAPH][ID:{id(self.cudagraph_piecewise_backend)}] Total step count: {self._debug_count_total_step}, CUDAGraph replay count: {self._debug_count_cudagraph_replay}"
+            )
             return self.cudagraph_piecewise_backend.__call__(**kwargs)
 
     def clear_cudagraph_piecewise_backend(self):