Add warm up fix.

dominicshanshan · dominicshanshan · commit 558382f2cdc8 · 2026-03-12T20:03:42.000-07:00
Signed-off-by: Wangshanshan &lt;30051912+dominicshanshan@users.noreply.github.com&gt;
diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py
@@ -349,6 +349,46 @@ def check_server_health(server_url: str,
         return False, f"Unexpected error during health check: {str(e)}"
 
 
+def warmup_inference(server_url: str,
+                     model_name: str,
+                     timeout: float = 300.0,
+                     num_warmup_requests: int = 2) -> bool:
+    """
+    Send a few lightweight completion requests to the server so the inference
+    pipeline (JIT, CUDA graphs, memory pools) is fully warmed up before
+    aiperf begins timing.  The /health endpoint only confirms the HTTP server
+    is up -- the first real inference can be orders of magnitude slower.
+
+    Returns True if warmup succeeded, False otherwise.
+    """
+    endpoint = f"{server_url}/v1/completions"
+    payload = {
+        "model": model_name,
+        "prompt": "Hello",
+        "max_tokens": 8,
+        "temperature": 0.0,
+    }
+    for i in range(num_warmup_requests):
+        try:
+            print_info(
+                f"Sending inference warmup request {i+1}/{num_warmup_requests}..."
+            )
+            resp = requests.post(endpoint, json=payload, timeout=timeout)
+            if resp.status_code == 200:
+                print_info(
+                    f"Warmup request {i+1}/{num_warmup_requests} succeeded")
+            else:
+                print_warning(
+                    f"Warmup request {i+1} returned status {resp.status_code}: "
+                    f"{resp.text[:200]}")
+                return False
+        except requests.RequestException as e:
+            print_warning(f"Warmup request {i+1} failed: {e}")
+            return False
+    print_info("Inference warmup complete")
+    return True
+
+
 def is_port_available(port: int,
                       host: str = "localhost") -> Tuple[bool, Optional[str]]:
     """
@@ -753,6 +793,14 @@ def stress_test(config,
             print_info(
                 f"Server is running with model {model_name}. Starting tests...")
 
+            # Warm up the inference pipeline before benchmarking.
+            # The /health endpoint only confirms the HTTP layer is up; the
+            # first real inference triggers JIT/CUDA-graph compilation that
+            # can take much longer than aiperf's per-request timeout.
+            if not warmup_inference(test_server_config.url, model_name):
+                print_warning("Inference warmup failed -- proceeding anyway, "
+                              "but aiperf may hit startup timeouts")
+
             # Run baseline accuracy test first if enabled
             baseline_accuracy_success = True
             if stress_config and stress_config.enable_accuracy_test:
@@ -852,7 +900,8 @@ def create_aiperf_command(model_name,
                           input_len_std=PerformanceParams.input_len_std,
                           output_len_mean=PerformanceParams.output_len_mean,
                           output_len_std=PerformanceParams.output_len_std,
-                          warmup_request_count=10):
+                          warmup_request_count=10,
+                          request_timeout_seconds=120.0):
     """
     Create a command list for aiperf with standardized parameters.
 
@@ -867,6 +916,7 @@ def create_aiperf_command(model_name,
         output_len_mean: Mean output length
         output_len_std: Standard deviation of output length
         warmup_request_count: Number of warmup requests
+        request_timeout_seconds: Per-request timeout in seconds for aiperf
 
     Returns:
         List of command-line arguments for aiperf
@@ -898,6 +948,8 @@ def create_aiperf_command(model_name,
         str(concurrency),
         "--warmup-request-count",
         str(warmup_request_count),
+        "--request-timeout-seconds",
+        str(request_timeout_seconds),
         # "--verbose",
     ]
 
@@ -910,6 +962,9 @@ def run_aiperf_process(cmd,
     """
     Run a aiperf process and monitor both the process and server health.
 
+    Captures stdout/stderr so that aiperf's error output is visible in the
+    pytest report when it exits with a non-zero code.
+
     Args:
         cmd: Command list to execute aiperf
         test_start_time: Start time of the test
@@ -920,29 +975,53 @@ def run_aiperf_process(cmd,
     Returns:
         Boolean indicating whether the process completed successfully
     """
-    # Start aiperf process with our context manager
-    with launch_process(cmd,
-                        start_new_session=True,
-                        filter_pattern=None,
-                        request_counter=request_counter) as process:
-        # Set monitoring parameters
+    stdout_lines = []
+    stderr_lines = []
+    stdout_lock = threading.Lock()
+    stderr_lock = threading.Lock()
+
+    def _capture_and_print(pipe, line_buffer, lock):
+        try:
+            for line in iter(pipe.readline, ''):
+                print(line, end='', flush=True)
+                with lock:
+                    line_buffer.append(line)
+        except (BrokenPipeError, IOError, ValueError):
+            pass
+
+    process = Popen(cmd,
+                    start_new_session=True,
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    bufsize=1,
+                    universal_newlines=True)
+    print_info(f"Process started with PID: {process.pid}")
+
+    stdout_reader = threading.Thread(target=_capture_and_print,
+                                     args=(process.stdout, stdout_lines,
+                                           stdout_lock),
+                                     daemon=True)
+    stderr_reader = threading.Thread(target=_capture_and_print,
+                                     args=(process.stderr, stderr_lines,
+                                           stderr_lock),
+                                     daemon=True)
+    stdout_reader.start()
+    stderr_reader.start()
+
+    try:
         last_health_check = time.time()
         process_completed = False
 
-        # Monitor both the server and aiperf process
         while process.poll() is None:
             current_time = time.time()
 
-            # Check if aiperf is still running but exceeded timeout
             elapsed_time = current_time - test_start_time
             if elapsed_time > test_timeout:
                 cleanup_process_tree(process, has_session=True)
                 raise RuntimeError(
                     f"aiperf test timed out after {test_timeout} seconds")
 
-            # Check server health periodically
             if current_time - last_health_check > server_config.health_check_timeout:
-
                 is_healthy, error_msg = check_server_health(
                     server_config.url, server_config.health_check_timeout)
 
@@ -951,31 +1030,50 @@ def run_aiperf_process(cmd,
                         f"Server health check passed after {elapsed_time:.1f} seconds of test"
                     )
                 else:
-                    # Raise an exception to stop the test
                     print_warning(f"Server health check failed: {error_msg}")
                     cleanup_process_tree(process, has_session=True)
                     raise RuntimeError(
                         f"Server health check failed during test: {error_msg}")
 
-                # Update last health check time
                 last_health_check = current_time
 
             time.sleep(0.5)
 
-        # Check final status of aiperf process
+        stdout_reader.join(timeout=5)
+        stderr_reader.join(timeout=5)
+
         retcode = process.poll()
         if retcode is not None:
             if retcode != 0:
+                with stderr_lock:
+                    captured_stderr = ''.join(stderr_lines[-50:])
+                with stdout_lock:
+                    captured_stdout = ''.join(stdout_lines[-50:])
                 cleanup_process_tree(process, has_session=True)
                 raise RuntimeError(
-                    f"aiperf exited with non-zero code: {retcode}")
+                    f"aiperf exited with non-zero code: {retcode}\n"
+                    f"--- aiperf stdout (last 50 lines) ---\n"
+                    f"{captured_stdout}\n"
+                    f"--- aiperf stderr (last 50 lines) ---\n"
+                    f"{captured_stderr}")
             else:
                 print_info("aiperf completed successfully")
                 process_completed = True
         else:
             cleanup_process_tree(process, has_session=True)
             raise RuntimeError(
                 "aiperf did not complete normally, will terminate")
+    finally:
+        if process.poll() is None:
+            process.terminate()
+            try:
+                process.wait(timeout=GRACEFUL_TERMINATION_TIMEOUT)
+            except subprocess.TimeoutExpired:
+                cleanup_process_tree(process, has_session=True)
+        if process.stdout:
+            process.stdout.close()
+        if process.stderr:
+            process.stderr.close()
 
     return process_completed