[https://nvbugs/6143599][fix] DeepSeek-V3 OOM and artifacts path

dominicshanshan · dominicshanshan · commit 17770ee95afb · 2026-05-17T18:28:19.000-07:00
- Lower kv_cache_free_gpu_memory_fraction from 0.85 to 0.75 for
  DeepSeek-V3/R1; the previous fraction left no headroom for the
  transient DeepGEMM MoE workspace and OOM'd at max_batch_size=2048.
- Set PYTORCH_ALLOC_CONF=expandable_segments:True for DeepSeek-V3/R1
  to reduce CUDA allocator fragmentation under stress.
- Add ARTIFACTS_DIR constant anchored to this file's location; pass it
  to aiperf via --output-artifact-dir and use it as the default reader
  path in extract_stress_test_metrics, so writes and reads stay aligned
  independent of pytest cwd.

Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py
@@ -72,6 +72,12 @@
 # Define a constant for process termination timeouts
 GRACEFUL_TERMINATION_TIMEOUT = 300  # seconds - set longer when stress large model
 
+# Single source of truth for aiperf artifact location.
+# Passed to aiperf via --output-artifact-dir so writes and reads stay aligned
+# regardless of the pytest cwd.
+ARTIFACTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                             "artifacts")
+
 
 def _get_default_port() -> int:
     """Get a default port using CI allocation if available, otherwise use 8000."""
@@ -571,6 +577,9 @@ def stress_test(config,
 
     # For DeepSeek-V3 or DeepSeek-R1 specific server parameters
     if "DeepSeek-V3" in config.model_dir or "DeepSeek-R1" in config.model_dir:
+        # Reduce CUDA allocator fragmentation so transient MoE workspace
+        # allocations don't OOM when KV cache reservation is large.
+        os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
         test_server_config = ServerConfig(
             port=test_server_config.port,
             host=test_server_config.host,
@@ -582,7 +591,7 @@ def stress_test(config,
             max_num_tokens=
             8192,  # DeepSeek-V3 or DeepSeek-R1 specific max_num_tokens
             kv_cache_free_gpu_memory_fraction=
-            0.85,  # DeepSeek-V3 or DeepSeek-R1 specific kv_cache fraction
+            0.75,  # DeepSeek-V3 or DeepSeek-R1 specific kv_cache fraction
             capacity_scheduler_policy=test_server_config.
             capacity_scheduler_policy,
             wait_interval=test_server_config.wait_interval,
@@ -954,6 +963,8 @@ def create_aiperf_command(model_name,
         str(request_count),
         "--concurrency",
         str(concurrency),
+        "--output-artifact-dir",
+        ARTIFACTS_DIR,
         # "--verbose",
     ]
 
@@ -1365,8 +1376,7 @@ def extract_stress_test_metrics(artifacts_dir=None, current_model=None):
     # For local testing, the artifacts are at
     # artifacts_dir = os.path.join(script_dir, "artifacts")
     if artifacts_dir is None:
-        script_dir = os.path.dirname(os.path.abspath(__file__))
-        artifacts_dir = os.path.join(script_dir, "..", "artifacts")
+        artifacts_dir = ARTIFACTS_DIR
 
     # Find all profile_export_aiperf.json files in the artifacts directory
     json_files = glob(os.path.join(artifacts_dir,