Skip to content

Commit a8d7cb0

Browse files
authored
Fix ep deployment issues (#4084)
* fix for multi-node ep
* add deep_gemm jit dependencies
* update docker
* set deepep mode for cuda graph
* fix
* bring back gdrcopy
* move to envs
* fix cu13 build
* fix LD_LIBRARY_PATH
* separate GDRCopy build and install
* minor updates
* fix dir error
* optimize deepep moe check
* fix lint
1 parent 3d7d2d8 commit a8d7cb0

File tree

8 files changed

+101
-20
lines changed

8 files changed

+101
-20
lines changed

docker/Dockerfile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ COPY . /opt/lmdeploy
4343
WORKDIR /opt/lmdeploy
4444

4545
RUN --mount=type=cache,target=/root/.cache \
46-
docker/build.sh && \
46+
docker/build.sh
47+
48+
RUN --mount=type=cache,target=/root/.cache \
4749
docker/prepare_wheel.sh
4850

4951
# Runtime image
@@ -67,6 +69,7 @@ COPY docker/install.sh /tmp/install.sh
6769
RUN --mount=type=cache,target=/root/.cache \
6870
--mount=type=cache,target=/wheels,from=builder,source=/wheels \
6971
--mount=type=cache,target=/nccl,from=builder,source=/nccl \
72+
--mount=type=cache,target=/debs,from=builder,source=/debs \
7073
/tmp/install.sh
7174

7275
# explicitly set ptxas path for triton

docker/install.sh

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,58 @@
11
#!/bin/bash -ex
22

3+
# install system packages
34
export DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC
45
sed -i 's|http://archive.ubuntu.com|http://azure.archive.ubuntu.com|g' /etc/apt/sources.list
56
apt-get update -y
67
apt-get install -y --no-install-recommends \
7-
tzdata wget curl ssh sudo git-core libibverbs1 ibverbs-providers ibverbs-utils librdmacm1 libibverbs-dev rdma-core libmlx5-1
8+
tzdata wget curl ssh sudo git-core vim libibverbs1 ibverbs-providers ibverbs-utils librdmacm1 libibverbs-dev rdma-core libmlx5-1
89

910
if [[ ${PYTHON_VERSION} != "3.10" ]]; then
1011
apt-get install -y --no-install-recommends software-properties-common
1112
add-apt-repository -y ppa:deadsnakes/ppa
1213
apt-get update -y
1314
fi
1415

16+
# install python, create virtual env
1517
apt-get install -y --no-install-recommends \
1618
python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv
1719

1820
pushd /opt >/dev/null
1921
python${PYTHON_VERSION} -m venv py3
2022
popd >/dev/null
2123

24+
# install CUDA build tools
2225
if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
2326
apt-get install -y --no-install-recommends cuda-minimal-build-11-8
2427
elif [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then
25-
apt-get install -y --no-install-recommends cuda-minimal-build-12-4
28+
apt-get install -y --no-install-recommends cuda-minimal-build-12-4 dkms
2629
elif [[ "${CUDA_VERSION_SHORT}" = "cu128" ]]; then
27-
apt-get install -y --no-install-recommends cuda-minimal-build-12-8
30+
apt-get install -y --no-install-recommends cuda-minimal-build-12-8 dkms
2831
elif [[ "${CUDA_VERSION_SHORT}" = "cu130" ]]; then
29-
apt-get install -y --no-install-recommends cuda-minimal-build-13-0
32+
apt-get install -y --no-install-recommends cuda-minimal-build-13-0 dkms
3033
fi
3134

3235
apt-get clean -y
3336
rm -rf /var/lib/apt/lists/*
3437

38+
# install GDRCopy debs
39+
if [ -d "/debs" ] && [ "$(ls -A /debs/*.deb 2>/dev/null)" ]; then
40+
dpkg -i /debs/*.deb
41+
fi
42+
43+
# install python packages
3544
export PATH=/opt/py3/bin:$PATH
3645

3746
if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
3847
FA_VERSION=2.7.3
3948
TORCH_VERSION="<2.7"
49+
elif [[ "${CUDA_VERSION_SHORT}" = "cu130" ]]; then
50+
FA_VERSION=2.8.3
51+
TORCH_VERSION="==2.9.0"
4052
else
4153
FA_VERSION=2.8.3
42-
TORCH_VERSION=""
54+
# pin torch version to avoid build and runtime mismatch, o.w. deep_gemm undefined symbol error
55+
TORCH_VERSION="==2.8.0"
4356
fi
4457

4558
pip install -U pip wheel setuptools
@@ -50,13 +63,15 @@ elif [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then
5063
pip install nvidia-nvshmem-cu12
5164
fi
5265

53-
pip install /wheels/*.whl torch${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA_VERSION_SHORT}
66+
pip install torch${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA_VERSION_SHORT}
67+
pip install /wheels/*.whl
5468

5569

5670
if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]] && [[ "${PYTHON_VERSION}" != "3.9" ]]; then
57-
pip install cuda-python dlblas
71+
pip install cuda-python dlblas==0.0.6
5872
fi
5973

74+
# install pre-compiled flash attention wheel
6075
PLATFORM="linux_x86_64"
6176
PY_VERSION=$(python3 - <<'PY'
6277
import torch, sys

docker/prepare_wheel.sh

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@ export PATH=/opt/py3/bin:$PATH
44

55
if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
66
TORCH_VERSION="<2.7"
7+
elif [[ "${CUDA_VERSION_SHORT}" = "cu130" ]]; then
8+
TORCH_VERSION="==2.9.0"
79
else
8-
TORCH_VERSION=""
10+
# pin torch version to avoid build and runtime mismatch, o.w. deep_gemm undefined symbol error
11+
TORCH_VERSION="==2.8.0"
912
fi
1013

1114
pip install "cmake<4.0" wheel ninja setuptools packaging
@@ -21,24 +24,35 @@ fi
2124

2225
if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then
2326

24-
if [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then
25-
DEEP_GEMM_VERSION=03d0be3
26-
FLASH_MLA_VERSION=9edee0c
27-
else
28-
DEEP_GEMM_VERSION=79f48ee
29-
FLASH_MLA_VERSION=c759027
30-
fi
31-
32-
DEEP_EP_VERSION=26cf250
27+
GDRCOPY_VERSION=2.5.1
28+
DEEP_EP_VERSION=9af0e0d # v1.2.1
29+
DEEP_GEMM_VERSION=c9f8b34 # v2.1.1.post3
30+
FLASH_MLA_VERSION=1408756 # no release, pick the latest commit
3331

32+
# DeepEP
3433
if [[ "${CUDA_VERSION_SHORT}" = "cu130" ]]; then
3534
export CPLUS_INCLUDE_PATH="/usr/local/cuda/include/cccl":${CPLUS_INCLUDE_PATH}
3635
pip install nvidia-nvshmem-cu13
3736
else
3837
pip install nvidia-nvshmem-cu12
3938
fi
40-
4139
pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/DeepEP.git@${DEEP_EP_VERSION}"
42-
pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/FlashMLA.git@${FLASH_MLA_VERSION}"
40+
41+
# DeepGEMM
4342
pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/DeepGEMM.git@${DEEP_GEMM_VERSION}"
43+
44+
# FlashMLA
45+
# sm100 compilation for Flash MLA requires NVCC 12.9 or higher
46+
FLASH_MLA_DISABLE_SM100=1 pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/FlashMLA.git@${FLASH_MLA_VERSION}"
47+
48+
# GDRCopy debs
49+
apt-get update -y \
50+
&& apt-get install -y --no-install-recommends build-essential devscripts debhelper fakeroot pkg-config dkms
51+
52+
wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz \
53+
&& tar -xzf v${GDRCOPY_VERSION}.tar.gz && rm v${GDRCOPY_VERSION}.tar.gz \
54+
&& cd gdrcopy-${GDRCOPY_VERSION}/packages \
55+
&& CUDA=/usr/local/cuda ./build-deb-packages.sh \
56+
&& mkdir -p /debs \
57+
&& mv ./*.deb /debs/
4458
fi

lmdeploy/pytorch/backends/cuda/graph_runner.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import torch
66
from torch.profiler import record_function
77

8+
from lmdeploy.pytorch.backends.deepep_moe_checker import moe_backend
89
from lmdeploy.pytorch.backends.selector import get_backend
910
from lmdeploy.pytorch.config import BackendConfig, CacheConfig, ModelConfig
1011
from lmdeploy.pytorch.model_inputs import StepContext, get_step_ctx_manager
@@ -250,6 +251,12 @@ def prepare_inputs_for_generation(
250251
context: StepContext = None,
251252
):
252253
"""Prepare inputs."""
254+
255+
if moe_backend.use_deepep_moe_backend():
256+
from dlblas.layers.moe.token_dispatcher import DeepEPBuffer, DeepEPMode
257+
deepep_mode = DeepEPMode.LOW_LATENCY if context.is_decoding else DeepEPMode.NORMAL
258+
DeepEPBuffer.set_deepep_mode(deepep_mode)
259+
253260
return self.model.prepare_inputs_for_generation(
254261
past_key_values=past_key_values,
255262
inputs_embeds=inputs_embeds,

lmdeploy/pytorch/backends/cuda/moe.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import torch
77
import torch.distributed as dist
88

9+
from lmdeploy.pytorch.backends.deepep_moe_checker import moe_backend
910
from lmdeploy.pytorch.distributed import get_dist_manager
1011
from lmdeploy.pytorch.kernels.cuda import fused_moe, fused_moe_w8a8
1112
from lmdeploy.pytorch.kernels.cuda.blocked_fp8_fused_moe import fused_moe_blocked_fp8
@@ -475,6 +476,12 @@ def __init__(self,
475476
self.use_deep_gemm = False
476477
logger.warning('For higher performance, please install DeepGEMM https://github.com/deepseek-ai/DeepGEMM')
477478

479+
try:
480+
from dlblas.layers.moe.token_dispatcher import DeepEPBuffer, DeepEPMode, use_deepep # noqa: F401
481+
moe_backend.set_deepep_moe_backend()
482+
except ImportError:
483+
logger.warning('For higher performance, please install DeepEP https://github.com/deepseek-ai/DeepEP')
484+
478485
# pre-allocate buffer
479486
self.fusedmoe_build(True)
480487

lmdeploy/pytorch/backends/cuda/token_dispatcher.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
# Copyright (c) OpenMMLab. All rights reserved.
22
try:
33
from deep_ep import Buffer
4+
5+
from lmdeploy.pytorch.envs import deep_ep_buffer_num_sms
6+
7+
Buffer.set_num_sms(deep_ep_buffer_num_sms)
48
use_deepep = True
59
except ImportError:
610
use_deepep = False
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright (c) OpenMMLab. All rights reserved.
2+
class MoEBackend:
3+
4+
def __init__(self):
5+
"""Initialize moe backend."""
6+
self._use_deepep_moe_backend = False
7+
8+
def set_deepep_moe_backend(self):
9+
"""Set deepep moe backend."""
10+
self._use_deepep_moe_backend = True
11+
12+
def use_deepep_moe_backend(self):
13+
"""Get deepep moe backend."""
14+
return self._use_deepep_moe_backend
15+
16+
17+
MOE_BACKEND = None
18+
19+
20+
def get_moe_backend():
21+
global MOE_BACKEND
22+
if MOE_BACKEND is None:
23+
MOE_BACKEND = MoEBackend()
24+
25+
return MOE_BACKEND
26+
27+
28+
moe_backend = get_moe_backend()

lmdeploy/pytorch/envs.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,9 @@ def _patched_get_env(
127127
# If Ray is launched from outside, it may fail to access the environment variables.
128128
os.getenv('DEEPEP_MAX_BATCH_SIZE', None)
129129

130+
# deepep
131+
deep_ep_buffer_num_sms = env_to_int('DEEPEP_BUFFER_NUM_SMS', 20)
132+
130133
# deepgemm
131134
os.getenv('DG_JIT_DEBUG', '0')
132135
os.getenv('DG_JIT_PRINT_COMPILER_COMMAND', '0')

0 commit comments

Comments (0)