Skip to content

Commit 17770ee

Browse files
[https://nvbugs/6143599][fix] DeepSeek-V3 OOM and artifacts path
- Lower kv_cache_free_gpu_memory_fraction from 0.85 to 0.75 for DeepSeek-V3/R1; the previous fraction left no headroom for the transient DeepGEMM MoE workspace and OOM'd at max_batch_size=2048.
- Set PYTORCH_ALLOC_CONF=expandable_segments:True for DeepSeek-V3/R1 to reduce CUDA allocator fragmentation under stress.
- Add ARTIFACTS_DIR constant anchored to this file's location; pass it to aiperf via --output-artifact-dir and use it as the default reader path in extract_stress_test_metrics, so writes and reads stay aligned independent of pytest cwd.

Signed-off-by: Wangshanshan <[email protected]>
1 parent 45d15a1 commit 17770ee

1 file changed

Lines changed: 13 additions & 3 deletions

File tree

tests/integration/defs/stress_test/stress_test.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,12 @@
7272
# Define a constant for process termination timeouts
7373
GRACEFUL_TERMINATION_TIMEOUT = 300 # seconds - set longer when stress large model
7474

75+
# Single source of truth for aiperf artifact location.
76+
# Passed to aiperf via --output-artifact-dir so writes and reads stay aligned
77+
# regardless of the pytest cwd.
78+
ARTIFACTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)),
79+
"artifacts")
80+
7581

7682
def _get_default_port() -> int:
7783
"""Get a default port using CI allocation if available, otherwise use 8000."""
@@ -571,6 +577,9 @@ def stress_test(config,
571577

572578
# For DeepSeek-V3 or DeepSeek-R1 specific server parameters
573579
if "DeepSeek-V3" in config.model_dir or "DeepSeek-R1" in config.model_dir:
580+
# Reduce CUDA allocator fragmentation so transient MoE workspace
581+
# allocations don't OOM when KV cache reservation is large.
582+
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
574583
test_server_config = ServerConfig(
575584
port=test_server_config.port,
576585
host=test_server_config.host,
@@ -582,7 +591,7 @@ def stress_test(config,
582591
max_num_tokens=
583592
8192, # DeepSeek-V3 or DeepSeek-R1 specific max_num_tokens
584593
kv_cache_free_gpu_memory_fraction=
585-
0.85, # DeepSeek-V3 or DeepSeek-R1 specific kv_cache fraction
594+
0.75, # DeepSeek-V3 or DeepSeek-R1 specific kv_cache fraction
586595
capacity_scheduler_policy=test_server_config.
587596
capacity_scheduler_policy,
588597
wait_interval=test_server_config.wait_interval,
@@ -954,6 +963,8 @@ def create_aiperf_command(model_name,
954963
str(request_count),
955964
"--concurrency",
956965
str(concurrency),
966+
"--output-artifact-dir",
967+
ARTIFACTS_DIR,
957968
# "--verbose",
958969
]
959970

@@ -1365,8 +1376,7 @@ def extract_stress_test_metrics(artifacts_dir=None, current_model=None):
13651376
# For local testing, the artifacts are at
13661377
# artifacts_dir = os.path.join(script_dir, "artifacts")
13671378
if artifacts_dir is None:
1368-
script_dir = os.path.dirname(os.path.abspath(__file__))
1369-
artifacts_dir = os.path.join(script_dir, "..", "artifacts")
1379+
artifacts_dir = ARTIFACTS_DIR
13701380

13711381
# Find all profile_export_aiperf.json files in the artifacts directory
13721382
json_files = glob(os.path.join(artifacts_dir,

0 commit comments

Comments
 (0)