bugfix: fix initialization error for mooncake in k8s (vllm-project#2541)

zzy-ContiLearn · gemini-code-assist[bot] · LCAIZJ · yangxiaojun0126 · commit 9af9207392d0 · 2025-09-18T14:50:06.000+08:00
### What this PR does / why we need it? The details are explained in this issue: vllm-project#2557 ### Does this PR introduce _any_ user-facing change? NO ### How was this patch tested? Easy to test because we just need to echo the variable - vLLM version: v0.10.1.1 - vLLM main: vllm-project/vllm@6997a25 --------- Signed-off-by: zzy-ContiLearn <1831242919@qq.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: LCAIZJ <leichao139636@163.com>
diff --git a/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md b/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md
@@ -32,6 +32,7 @@ export GLOO_SOCKET_IFNAME="xxxxxx"
 export TP_SOCKET_IFNAME="xxxxxx"
 export HCCL_SOCKET_IFNAME="xxxxxx"
 export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
+export PHYSICAL_DEVICES=$(ls /dev/davinci* 2>/dev/null | grep -o '[0-9]\+' | sort -n | paste -sd',' -)
 
 vllm serve "/xxxxx/DeepSeek-V2-Lite-Chat" \
   --host localhost \
@@ -100,6 +101,7 @@ export GLOO_SOCKET_IFNAME="xxxxxx"
 export TP_SOCKET_IFNAME="xxxxxx"
 export HCCL_SOCKET_IFNAME="xxxxxx"
 export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7
+export PHYSICAL_DEVICES=$(ls /dev/davinci* 2>/dev/null | grep -o '[0-9]\+' | sort -n | paste -sd',' -)
 
 vllm serve "/xxxxx/DeepSeek-V2-Lite-Chat" \
   --host localhost \
diff --git a/tests/ut/kv_connector/test_mooncake_connector.py b/tests/ut/kv_connector/test_mooncake_connector.py
@@ -1094,6 +1094,7 @@ def register_memory(self, *args, **kwargs):
 
 class MockEnvsAscend:
     MOONCAKE_CONNECTOR_PROTOCOL = "mock_protocol"
+    PHYSICAL_DEVICES = "10,11"
 
 
 def mock_get_tensor_model_parallel_rank():
@@ -1122,7 +1123,7 @@ def setUp(self):
         self.mock_transfer_engine.register_memory.return_value = 0
 
         self.patches = [
-            patch('os.getenv', return_value="0,1"),
+            patch('os.getenv', return_value="10,11"),
             patch('torch.Tensor.size', return_value=(10, 16, 8, 16)),
             patch('torch.Tensor.element_size', return_value=4),
             patch('torch.Tensor.data_ptr', return_value=0x1000),
@@ -1191,6 +1192,12 @@ def test_register_kv_caches_mla_case(self):
         self.assertTrue(worker.use_mla)
         self.assertEqual(len(worker.block_len), 2)
 
+    def test_device_id_selection_with_physical_devices(self):
+        # Test with physical devices set
+        worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id)
+        # Default tp_rank is 0, so device_id should be 10
+        self.assertEqual(worker.device_id, 10)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/vllm_ascend/distributed/mooncake_connector.py b/vllm_ascend/distributed/mooncake_connector.py
@@ -2,7 +2,6 @@
 import contextlib
 import hashlib
 import math
-import os
 import queue
 import random
 import struct
@@ -29,6 +28,8 @@
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.request import RequestStatus
 
+import vllm_ascend.envs as envs_ascend
+
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
     from vllm.forward_context import ForwardContext
@@ -758,13 +759,21 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str):
         # get tp device id
         # TODO(kw): https://github.com/vllm-project/vllm-ascend/pull/940
         # introducing some changes
-        device_ids_str = os.getenv("ASCEND_RT_VISIBLE_DEVICES", None)
+        device_ids_str = envs_ascend.PHYSICAL_DEVICES
         if device_ids_str is None:
             device_ids = list(
                 range(self.dp_rank * self.tp_size,
                       (self.dp_rank + 1) * self.tp_size))
         else:
             device_ids = list(map(int, device_ids_str.split(',')))
+            start_index = self.dp_rank * self.tp_size
+            end_index = start_index + self.tp_size
+            if len(device_ids) < end_index:
+                raise ValueError(
+                    f"Not enough physical devices available for DP rank {self.dp_rank}. "
+                    f"Expected at least {end_index} devices, but found {len(device_ids)} "
+                    "in PHYSICAL_DEVICES.")
+            device_ids = device_ids[start_index:end_index]
         assert len(device_ids) > self.tp_rank  # type: ignore
         self.device_id = device_ids[self.tp_rank]  # type: ignore
 
diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py
@@ -135,6 +135,10 @@
     # this feature in eager mode will get better performance.
     "VLLM_ASCEND_ENABLE_MLP_OPTIMIZE":
     lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLP_OPTIMIZE", '0'))),
+    # Comma-separated list of physical device IDs (e.g. "0,1,2,3"), read by the
+    # Mooncake connector during initialization when not all devices are in use.
+    "PHYSICAL_DEVICES":
+    lambda: os.getenv("PHYSICAL_DEVICES", None),
 }
 
 # end-env-vars-definition