Skip to content

Commit fff5fb5

Browse files
authored
[Graph Optimization] Refactor default capture list (#4617)
* fix bug and refine code
* add debug count
* refine code
1 parent 0a0c74e commit fff5fb5

File tree

3 files changed

+36
-15
lines changed

3 files changed

+36
-15
lines changed

fastdeploy/config.py

Lines changed: 15 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -862,7 +862,7 @@ def init_with_cudagrpah_size(self, max_capture_size: int = 0) -> None:
862862
self.real_shape_to_captured_size[bs] = end
863863
self.real_shape_to_captured_size[self.max_capture_size] = self.max_capture_size
864864

865-
def _set_cudagraph_sizes(self, max_num_seqs: int = 0):
865+
def _set_cudagraph_sizes(self, max_capture_size: int = 0):
866866
"""
867867
Calculate a series of candidate capture sizes,
868868
and then extract a portion of them as the capture list for the CUDA graph based on user input.
@@ -874,7 +874,7 @@ def _set_cudagraph_sizes(self, max_num_seqs: int = 0):
874874
# Shape [256, 288, ... 992, 1024]
875875
draft_capture_sizes += [32 * i for i in range(9, 33)]
876876

877-
draft_capture_sizes.append(max_num_seqs)
877+
draft_capture_sizes.append(max_capture_size)
878878
self.cudagraph_capture_sizes = sorted(draft_capture_sizes)
879879

880880
def to_json_string(self):
@@ -1391,19 +1391,22 @@ def __init__(
13911391
self.cache_config: CacheConfig = cache_config # type: ignore
13921392
self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config
13931393
self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config
1394-
# Initialize cuda graph capture list
1395-
if self.graph_opt_config.cudagraph_capture_sizes is None:
1396-
self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.scheduler_config.max_num_seqs)
13971394

1395+
# Initialize cuda graph capture list
1396+
max_capture_shape = self.scheduler_config.max_num_seqs
1397+
if self.speculative_config is not None and self.speculative_config.method == "mtp":
1398+
max_capture_shape = self.scheduler_config.max_num_seqs * (
1399+
self.speculative_config.num_speculative_tokens + 1
1400+
)
1401+
assert max_capture_shape % 2 == 0, "CUDAGraph only supports capturing even token nums in MTP scenarios."
13981402
if self.graph_opt_config.cudagraph_only_prefill:
1399-
self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=512)
1400-
elif self.speculative_config is not None and self.speculative_config.method == "mtp":
1401-
max_shape = self.scheduler_config.max_num_seqs * (self.speculative_config.num_speculative_tokens + 1)
1402-
if max_shape % 2 == 1:
1403-
max_shape = max_shape + 1
1404-
self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=min(512, max_shape))
1403+
max_capture_shape = 512
14051404
else:
1406-
self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.scheduler_config.max_num_seqs)
1405+
max_capture_shape = min(512, max_capture_shape)
1406+
1407+
if self.graph_opt_config.cudagraph_capture_sizes is None:
1408+
self.graph_opt_config._set_cudagraph_sizes(max_capture_size=max_capture_shape)
1409+
self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=max_capture_shape)
14071410

14081411
self.tokenizer = tokenizer
14091412
self.ips = ips

fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -171,7 +171,7 @@ def __call__(self, **kwargs) -> List[paddle.Tensor] | paddle.Tensor:
171171
for n in range(entry.num_finished_warmup, self.warm_up_size):
172172
entry.num_finished_warmup += 1
173173
entry.runnable(**kwargs)
174-
logger.debug(
174+
logger.info(
175175
f"[CUDA GRAPH][ID:{id(self)}] Warm up for real shape {padding_real_shape}, "
176176
f"finished ({n + 1}/{entry.num_finished_warmup}) times"
177177
)
@@ -207,7 +207,7 @@ def __call__(self, **kwargs) -> List[paddle.Tensor] | paddle.Tensor:
207207

208208
# For CUDAGraph debug
209209
# self._save_cudagrpah_dot_files(entry)
210-
logger.debug(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}")
210+
logger.info(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}")
211211

212212
# Replay
213213
entry.cuda_graph.replay()
@@ -224,7 +224,7 @@ def _create_entry_dict(self):
224224
for shape in self.cudagraph_capture_sizes:
225225
self.concrete_size_entries[shape] = ConcreteSizeEntry(real_shape=shape)
226226

227-
logger.debug(
227+
logger.info(
228228
f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph capture list {self.cudagraph_capture_sizes}, "
229229
"Created all real shape entry."
230230
)
@@ -254,3 +254,9 @@ def _save_cudagrpah_dot_files(self, entry):
254254
f"{log_dir}/GraphDotFiles/backend{id(self)}_shape{entry.real_shape}",
255255
1 << 0,
256256
)
257+
258+
def check_capture_successful(self):
259+
"""Check whether the shapes are captured or not"""
260+
for shape, entry in self.concrete_size_entries.items():
261+
if not entry.captured:
262+
raise ValueError(f"[CUDA GRAPH][ID:{id(self)}] Shape {shape} capture failed.")

fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,10 @@
3434
from fastdeploy.model_executor.graph_optimization.utils import (
3535
in_sot_warmup_mode as in_warmup_mode,
3636
)
37+
from fastdeploy.utils import get_logger
38+
39+
logger = get_logger("cudagrpah_piecewise_backend", "cudagraph_piecewise_backend.log")
40+
3741

3842
P = ParamSpec("P")
3943
T = TypeVar("T")
@@ -105,6 +109,9 @@ def __init__(self, runnable: Callable, fd_config: FDConfig):
105109
self.dy_runnable = self.runnable
106110
self.fd_config = fd_config
107111
self.max_captre_size = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
112+
self._debug_count_cudagraph_replay = 0
113+
self._debug_count_total_step = 0
114+
108115
if self.fd_config.graph_opt_config.graph_opt_level > 0:
109116
# 1. Prepare cuda graph input buffers (contain output of subgraphs)
110117

@@ -123,6 +130,7 @@ def __init__(self, runnable: Callable, fd_config: FDConfig):
123130
)
124131

125132
def __call__(self, **kwargs):
133+
self._debug_count_total_step += 1
126134
if not self.fd_config.graph_opt_config.use_cudagraph:
127135
return self.runnable(**kwargs)
128136
if self.cudagraph_piecewise_backend is None:
@@ -136,6 +144,10 @@ def __call__(self, **kwargs):
136144
if (not kwargs["forward_meta"].step_use_cudagraph) or (real_shape > self.cudagraph_switch_threshold):
137145
return self.dy_runnable(**kwargs)
138146
else:
147+
self._debug_count_cudagraph_replay += 1
148+
logger.debug(
149+
f"[CUDA GRAPH][ID:{id(self.cudagraph_piecewise_backend)}] Total step count: {self._debug_count_total_step}, CUDAGraph replay count: {self._debug_count_cudagraph_replay}"
150+
)
139151
return self.cudagraph_piecewise_backend.__call__(**kwargs)
140152

141153
def clear_cudagraph_piecewise_backend(self):

0 commit comments

Comments
 (0)