Skip to content

Commit 3f5e425

Browse files
committed
Isolate Ray tests into separate pytest sessions to avoid GCS timeouts
Signed-off-by: shuyixiong <[email protected]>
1 parent 95204b7 commit 3f5e425

4 files changed

Lines changed: 18 additions & 6 deletions

File tree

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,10 @@ l0_dgx_b200:
9999
orchestrator: ray
100100
tests:
101101
- unittest/llmapi/test_llm_multi_gpu_pytorch.py -m "gpu4"
102-
- unittest/_torch/ray_orchestrator/multi_gpu/test_llm_update_weights_multi_gpu.py
102+
- unittest/_torch/ray_orchestrator/multi_gpu/test_llm_update_weights_multi_gpu.py -m "part0"
103+
- unittest/_torch/ray_orchestrator/multi_gpu/test_llm_update_weights_multi_gpu.py -m "part1"
104+
- unittest/_torch/ray_orchestrator/multi_gpu/test_llm_update_weights_multi_gpu.py -m "part2"
105+
- unittest/_torch/ray_orchestrator/multi_gpu/test_llm_update_weights_multi_gpu.py -m "part3"
103106
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False]
104107
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=False-attn_backend=TRTLLM-torch_compile=False]
105108
- disaggregated/test_disaggregated.py::test_disaggregated_ctxpp2_genpp2[TinyLlama-1.1B-Chat-v1.0]

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,11 @@ l0_dgx_h100:
319319
orchestrator: ray
320320
tests:
321321
- unittest/_torch/ray_orchestrator/multi_gpu -m "gpu2"
322+
- unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py -m "part0"
323+
- unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py -m "part1"
324+
- unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py -m "part2"
325+
- unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py -m "part3"
326+
- unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py -m "part4"
322327
- unittest/llmapi/test_llm_multi_gpu_pytorch.py -m "gpu2"
323328
- unittest/llmapi/test_async_llm.py -m "gpu2"
324329
- accuracy/test_llm_api_pytorch_ray.py::TestLlama3_1_8BInstruct::test_pp2_ray

tests/unittest/_torch/ray_orchestrator/multi_gpu/test_llm_update_weights_multi_gpu.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from tensorrt_llm.llmapi import KvCacheConfig, SamplingParams
2626

2727

28+
@pytest.mark.part0
2829
@skip_pre_blackwell
2930
@pytest.mark.parametrize(
3031
"model_dir, fp8_model_dir",
@@ -85,6 +86,7 @@ def test_llm_update_weights_fp8(model_dir, fp8_model_dir):
8586
compare_logits(llm_logits, ref_logits)
8687

8788

89+
@pytest.mark.part1
8890
@skip_pre_blackwell
8991
@pytest.mark.parametrize(
9092
"model_dir, fp8_model_dir",
@@ -434,6 +436,7 @@ def get_weight_ipc_handles_serialized(
434436
return ret
435437

436438

439+
@pytest.mark.part2
437440
@skip_pre_blackwell
438441
@pytest.mark.parametrize(
439442
"model_dir",
@@ -496,6 +499,7 @@ def test_llm_update_weights_nvfp4(model_dir, kv_cache_dtype):
496499
compare_logits(llm_logits, ref_logits, threshold=0.8)
497500

498501

502+
@pytest.mark.part3
499503
@skip_pre_blackwell
500504
@pytest.mark.parametrize(
501505
"model_dir",

tests/unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def run(self, pg_op_name: str, test_tensor: torch.Tensor,
8686
return True
8787

8888

89-
@pytest.mark.gpu2
89+
@pytest.mark.part0
9090
@pytest.mark.parametrize("hidden_size", [128, 1024],
9191
ids=lambda x: f"hidden:{x}")
9292
@pytest.mark.parametrize("seq_len", [16, 64], ids=lambda x: f"seqlen:{x}")
@@ -148,7 +148,7 @@ def test_allgather_pg_op(setup_ray_cluster, seq_len, hidden_size, var_len):
148148
assert r is True
149149

150150

151-
@pytest.mark.gpu2
151+
@pytest.mark.part1
152152
@pytest.mark.parametrize("hidden_size", [128, 1024],
153153
ids=lambda x: f"hidden:{x}")
154154
@pytest.mark.parametrize("seq_len", [16, 64], ids=lambda x: f"seqlen:{x}")
@@ -208,7 +208,7 @@ def test_reducescatter_pg_op(setup_ray_cluster, seq_len, hidden_size, var_len):
208208
assert r is True
209209

210210

211-
@pytest.mark.gpu2
211+
@pytest.mark.part2
212212
@pytest.mark.parametrize("hidden_size", [128, 1024],
213213
ids=lambda x: f"hidden:{x}")
214214
@pytest.mark.parametrize("seq_len", [16, 64], ids=lambda x: f"seqlen:{x}")
@@ -353,7 +353,7 @@ def run_tp_cp_broadcast(self, root_obj, root: int = 0):
353353
return result == root_obj
354354

355355

356-
@pytest.mark.gpu2
356+
@pytest.mark.part3
357357
@pytest.mark.parametrize("hidden_size", [128, 512], ids=lambda x: f"hidden:{x}")
358358
@pytest.mark.parametrize("seq_len", [16, 32], ids=lambda x: f"seqlen:{x}")
359359
def test_cp_broadcast_tensor(setup_ray_cluster, seq_len, hidden_size):
@@ -394,7 +394,7 @@ def test_cp_broadcast_tensor(setup_ray_cluster, seq_len, hidden_size):
394394
assert r is True, "Tensor broadcast from root=0 failed"
395395

396396

397-
@pytest.mark.gpu2
397+
@pytest.mark.part4
398398
@pytest.mark.parametrize("test_object", [
399399
{
400400
"key1": "value1",

0 commit comments

Comments
 (0)