Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion tensorrt_llm/_torch/pyexecutor/scheduler/scheduler_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,8 +230,19 @@ def _schedule_loop(self, active_requests, inflight_request_ids):

req_state_value = req.state_value

# Disagg gen init bypasses normal state gating (same as C++ / V1 scheduler)
# Disagg gen init bypasses normal state gating (same as C++ / V1 scheduler),
# but the V2 scheduler owns inline KV allocation so we must allocate here.
# V1 defers allocation to prepare_resources; V2 prepare_resources is a no-op
# for the primary manager, so allocation must happen in the scheduling loop.
if req_state_value == self._disagg_gen_init_state_value:
if not self.kv_cache_manager.prepare_context(req):
req_it += 1
continue
if not self.kv_cache_manager.resize_context(
req, req.context_remaining_length + get_draft_token_length(req)
):
req_it += 1
continue
disagg_candidates.append(req)
req_it += 1
continue
Expand Down
135 changes: 135 additions & 0 deletions tests/integration/defs/accuracy/test_disaggregated_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,50 @@ def test_auto_dtype(self, ctx_disable_overlap_scheduler,
self.MODEL_PATH) as llm:
run_accuracy_test(llm, self.MODEL_NAME, ["MMLU", "GSM8K"])

@skip_pre_hopper
@pytest.mark.skip_less_device(2)
def test_kv_cache_v2_nixl_python(self):
"""Test with use_kv_cache_manager_v2=True, block_reuse=False, backend=NIXL, transceiver_runtime=PYTHON"""
ctx_server_config = {
"disable_overlap_scheduler": True,
"kv_cache_config": {
"enable_block_reuse": False,
"use_kv_cache_manager_v2": True
},
"cache_transceiver_config": {
"backend": "NIXL",
"transceiver_runtime": "PYTHON"
}
}
gen_server_config = {
"disable_overlap_scheduler": False,
"kv_cache_config": {
"enable_block_reuse": False,
"use_kv_cache_manager_v2": True
},
"cache_transceiver_config": {
"backend": "NIXL",
"transceiver_runtime": "PYTHON"
}
}
disaggregated_server_config = {
"hostname": "localhost",
"port": 8000,
"backend": "pytorch",
"context_servers": {
"num_instances": 1,
"urls": ["localhost:8001"]
},
"generation_servers": {
"num_instances": 1,
"urls": ["localhost:8002"]
}
}
with launch_disaggregated_llm(disaggregated_server_config,
ctx_server_config, gen_server_config,
self.MODEL_PATH) as llm:
run_accuracy_test(llm, self.MODEL_NAME, ["GSM8K"])

@pytest.mark.skip_less_device(2)
def test_ngram(self):
speculative_decoding_config = {
Expand Down Expand Up @@ -1145,6 +1189,51 @@ def test_guided_decoding(self, backend: str, mtp_nextn: int, mocker):
self.MODEL_PATH) as llm:
run_accuracy_test(llm, self.MODEL_NAME, ["JsonModeEval"])

@pytest.mark.skip_less_device(2)
@pytest.mark.skip_less_device_memory(60000)
@skip_pre_hopper
def test_kv_cache_v2_nixl_python(self):
"""Test with use_kv_cache_manager_v2=True, block_reuse=False, backend=NIXL, transceiver_runtime=PYTHON"""
ctx_server_config = {
"disable_overlap_scheduler": True,
"kv_cache_config": {
"enable_block_reuse": False,
"use_kv_cache_manager_v2": True
},
"cache_transceiver_config": {
"backend": "NIXL",
"transceiver_runtime": "PYTHON"
}
}
gen_server_config = {
"disable_overlap_scheduler": True,
"kv_cache_config": {
"enable_block_reuse": False,
"use_kv_cache_manager_v2": True
},
"cache_transceiver_config": {
"backend": "NIXL",
"transceiver_runtime": "PYTHON"
}
}
disaggregated_server_config = {
"hostname": "localhost",
"port": 8000,
"backend": "pytorch",
"context_servers": {
"num_instances": 1,
"urls": ["localhost:8001"]
},
"generation_servers": {
"num_instances": 1,
"urls": ["localhost:8002"]
}
}
with launch_disaggregated_llm(disaggregated_server_config,
ctx_server_config, gen_server_config,
self.MODEL_PATH) as llm:
run_accuracy_test(llm, self.MODEL_NAME, ["GSM8K"])


@pytest.mark.timeout(DEFAULT_TEST_TIMEOUT)
class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
Expand Down Expand Up @@ -1197,6 +1286,52 @@ def test_auto_dtype(self, block_reuse):
self.MODEL_PATH) as llm:
run_accuracy_test(llm, self.MODEL_NAME, ["MMLU", "GSM8K"])

@pytest.mark.skip_less_device(2)
@skip_pre_hopper
def test_kv_cache_v2_nixl_python(self):
"""Test with use_kv_cache_manager_v2=True, block_reuse=False, backend=NIXL, transceiver_runtime=PYTHON"""
ctx_server_config = {
"disable_overlap_scheduler": True,
"cuda_graph_config": None,
"kv_cache_config": {
"enable_block_reuse": False,
"use_kv_cache_manager_v2": True
},
"cache_transceiver_config": {
"backend": "NIXL",
"transceiver_runtime": "PYTHON"
}
}
gen_server_config = {
"disable_overlap_scheduler": True,
"cuda_graph_config": None,
"kv_cache_config": {
"enable_block_reuse": False,
"use_kv_cache_manager_v2": True
},
"cache_transceiver_config": {
"backend": "NIXL",
"transceiver_runtime": "PYTHON"
}
}
disaggregated_server_config = {
"hostname": "localhost",
"port": 8000,
"backend": "pytorch",
"context_servers": {
"num_instances": 1,
"urls": ["localhost:8001"]
},
"generation_servers": {
"num_instances": 1,
"urls": ["localhost:8002"]
}
}
with launch_disaggregated_llm(disaggregated_server_config,
ctx_server_config, gen_server_config,
self.MODEL_PATH) as llm:
run_accuracy_test(llm, self.MODEL_NAME, ["MMLU", "GSM8K"])


@skip_pre_blackwell
@pytest.mark.skip_less_device_memory(80000)
Expand Down
3 changes: 3 additions & 0 deletions tests/integration/test_lists/test-db/l0_dgx_b300.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ l0_dgx_b300:
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8]
- accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_kv_cache_v2_nixl_python
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_kv_cache_v2_nixl_python
- accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_kv_cache_v2_nixl_python
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] TIMEOUT (180)
- condition:
Expand Down
3 changes: 3 additions & 0 deletions tests/integration/test_lists/test-db/l0_dgx_h100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,12 @@ l0_dgx_h100:
- accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_chunked_prefill
- accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_kv_cache_v2_nixl_python
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_kv_cache_v2_nixl_python
- accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
- accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
- accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_kv_cache_v2_nixl_python
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False-False]
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False-True]
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-True-False]
Expand Down
Loading