Skip to content

Commit a8d7cb0

Browse files
authored
Fix ep deployment issues (#4084)
* fix for multi-node ep
* add deep_gemm jit dependencies
* update docker
* set deepep mode for cuda graph
* fix
* bring back gdrcopy
* move to envs
* fix cu13 build
* fix LD_LIBRARY_PATH
* separate GDRCopy build and install
* minor updates
* fix dir error
* optimize deepep moe check
* fix lint
1 parent 3d7d2d8 commit a8d7cb0

File tree

8 files changed

+101
-20
lines changed

8 files changed

+101
-20
lines changed

docker/Dockerfile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ COPY . /opt/lmdeploy
4343
WORKDIR /opt/lmdeploy
4444

4545
RUN --mount=type=cache,target=/root/.cache \
46-
docker/build.sh && \
46+
docker/build.sh
47+
48+
RUN --mount=type=cache,target=/root/.cache \
4749
docker/prepare_wheel.sh
4850

4951
# Runtime image
@@ -67,6 +69,7 @@ COPY docker/install.sh /tmp/install.sh
6769
RUN --mount=type=cache,target=/root/.cache \
6870
--mount=type=cache,target=/wheels,from=builder,source=/wheels \
6971
--mount=type=cache,target=/nccl,from=builder,source=/nccl \
72+
--mount=type=cache,target=/debs,from=builder,source=/debs \
7073
/tmp/install.sh
7174

7275
# explicitly set ptxas path for triton

docker/install.sh

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,58 @@
11
#!/bin/bash -ex
22

3+
# install system packages
34
export DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC
45
sed -i 's|http://archive.ubuntu.com|http://azure.archive.ubuntu.com|g' /etc/apt/sources.list
56
apt-get update -y
67
apt-get install -y --no-install-recommends \
7-
tzdata wget curl ssh sudo git-core libibverbs1 ibverbs-providers ibverbs-utils librdmacm1 libibverbs-dev rdma-core libmlx5-1
8+
tzdata wget curl ssh sudo git-core vim libibverbs1 ibverbs-providers ibverbs-utils librdmacm1 libibverbs-dev rdma-core libmlx5-1
89

910
if [[ ${PYTHON_VERSION} != "3.10" ]]; then
1011
apt-get install -y --no-install-recommends software-properties-common
1112
add-apt-repository -y ppa:deadsnakes/ppa
1213
apt-get update -y
1314
fi
1415

16+
# install python, create virtual env
1517
apt-get install -y --no-install-recommends \
1618
python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv
1719

1820
pushd /opt >/dev/null
1921
python${PYTHON_VERSION} -m venv py3
2022
popd >/dev/null
2123

24+
# install CUDA build tools
2225
if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
2326
apt-get install -y --no-install-recommends cuda-minimal-build-11-8
2427
elif [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then
25-
apt-get install -y --no-install-recommends cuda-minimal-build-12-4
28+
apt-get install -y --no-install-recommends cuda-minimal-build-12-4 dkms
2629
elif [[ "${CUDA_VERSION_SHORT}" = "cu128" ]]; then
27-
apt-get install -y --no-install-recommends cuda-minimal-build-12-8
30+
apt-get install -y --no-install-recommends cuda-minimal-build-12-8 dkms
2831
elif [[ "${CUDA_VERSION_SHORT}" = "cu130" ]]; then
29-
apt-get install -y --no-install-recommends cuda-minimal-build-13-0
32+
apt-get install -y --no-install-recommends cuda-minimal-build-13-0 dkms
3033
fi
3134

3235
apt-get clean -y
3336
rm -rf /var/lib/apt/lists/*
3437

38+
# install GDRCopy debs
39+
if [ -d "/debs" ] && [ "$(ls -A /debs/*.deb 2>/dev/null)" ]; then
40+
dpkg -i /debs/*.deb
41+
fi
42+
43+
# install python packages
3544
export PATH=/opt/py3/bin:$PATH
3645

3746
if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
3847
FA_VERSION=2.7.3
3948
TORCH_VERSION="<2.7"
49+
elif [[ "${CUDA_VERSION_SHORT}" = "cu130" ]]; then
50+
FA_VERSION=2.8.3
51+
TORCH_VERSION="==2.9.0"
4052
else
4153
FA_VERSION=2.8.3
42-
TORCH_VERSION=""
54+
# pin torch version to avoid build and runtime mismatch, o.w. deep_gemm undefined symbol error
55+
TORCH_VERSION="==2.8.0"
4356
fi
4457

4558
pip install -U pip wheel setuptools
@@ -50,13 +63,15 @@ elif [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then
5063
pip install nvidia-nvshmem-cu12
5164
fi
5265

53-
pip install /wheels/*.whl torch${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA_VERSION_SHORT}
66+
pip install torch${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA_VERSION_SHORT}
67+
pip install /wheels/*.whl
5468

5569

5670
if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]] && [[ "${PYTHON_VERSION}" != "3.9" ]]; then
57-
pip install cuda-python dlblas
71+
pip install cuda-python dlblas==0.0.6
5872
fi
5973

74+
# install pre-compiled flash attention wheel
6075
PLATFORM="linux_x86_64"
6176
PY_VERSION=$(python3 - <<'PY'
6277
import torch, sys

docker/prepare_wheel.sh

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@ export PATH=/opt/py3/bin:$PATH
44

55
if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
66
TORCH_VERSION="<2.7"
7+
elif [[ "${CUDA_VERSION_SHORT}" = "cu130" ]]; then
8+
TORCH_VERSION="==2.9.0"
79
else
8-
TORCH_VERSION=""
10+
# pin torch version to avoid build and runtime mismatch, o.w. deep_gemm undefined symbol error
11+
TORCH_VERSION="==2.8.0"
912
fi
1013

1114
pip install "cmake<4.0" wheel ninja setuptools packaging
@@ -21,24 +24,35 @@ fi
2124

2225
if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then
2326

24-
if [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then
25-
DEEP_GEMM_VERSION=03d0be3
26-
FLASH_MLA_VERSION=9edee0c
27-
else
28-
DEEP_GEMM_VERSION=79f48ee
29-
FLASH_MLA_VERSION=c759027
30-
fi
31-
32-
DEEP_EP_VERSION=26cf250
27+
GDRCOPY_VERSION=2.5.1
28+
DEEP_EP_VERSION=9af0e0d # v1.2.1
29+
DEEP_GEMM_VERSION=c9f8b34 # v2.1.1.post3
30+
FLASH_MLA_VERSION=1408756 # no release, pick the latest commit
3331

32+
# DeepEP
3433
if [[ "${CUDA_VERSION_SHORT}" = "cu130" ]]; then
3534
export CPLUS_INCLUDE_PATH="/usr/local/cuda/include/cccl":${CPLUS_INCLUDE_PATH}
3635
pip install nvidia-nvshmem-cu13
3736
else
3837
pip install nvidia-nvshmem-cu12
3938
fi
40-
4139
pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/DeepEP.git@${DEEP_EP_VERSION}"
42-
pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/FlashMLA.git@${FLASH_MLA_VERSION}"
40+
41+
# DeepGEMM
4342
pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/DeepGEMM.git@${DEEP_GEMM_VERSION}"
43+
44+
# FlashMLA
45+
# sm100 compilation for Flash MLA requires NVCC 12.9 or higher
46+
FLASH_MLA_DISABLE_SM100=1 pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/FlashMLA.git@${FLASH_MLA_VERSION}"
47+
48+
# GDRCopy debs
49+
apt-get update -y \
50+
&& apt-get install -y --no-install-recommends build-essential devscripts debhelper fakeroot pkg-config dkms
51+
52+
wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz \
53+
&& tar -xzf v${GDRCOPY_VERSION}.tar.gz && rm v${GDRCOPY_VERSION}.tar.gz \
54+
&& cd gdrcopy-${GDRCOPY_VERSION}/packages \
55+
&& CUDA=/usr/local/cuda ./build-deb-packages.sh \
56+
&& mkdir -p /debs \
57+
&& mv ./*.deb /debs/
4458
fi

lmdeploy/pytorch/backends/cuda/graph_runner.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import torch
66
from torch.profiler import record_function
77

8+
from lmdeploy.pytorch.backends.deepep_moe_checker import moe_backend
89
from lmdeploy.pytorch.backends.selector import get_backend
910
from lmdeploy.pytorch.config import BackendConfig, CacheConfig, ModelConfig
1011
from lmdeploy.pytorch.model_inputs import StepContext, get_step_ctx_manager
@@ -250,6 +251,12 @@ def prepare_inputs_for_generation(
250251
context: StepContext = None,
251252
):
252253
"""Prepare inputs."""
254+
255+
if moe_backend.use_deepep_moe_backend():
256+
from dlblas.layers.moe.token_dispatcher import DeepEPBuffer, DeepEPMode
257+
deepep_mode = DeepEPMode.LOW_LATENCY if context.is_decoding else DeepEPMode.NORMAL
258+
DeepEPBuffer.set_deepep_mode(deepep_mode)
259+
253260
return self.model.prepare_inputs_for_generation(
254261
past_key_values=past_key_values,
255262
inputs_embeds=inputs_embeds,

lmdeploy/pytorch/backends/cuda/moe.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import torch
77
import torch.distributed as dist
88

9+
from lmdeploy.pytorch.backends.deepep_moe_checker import moe_backend
910
from lmdeploy.pytorch.distributed import get_dist_manager
1011
from lmdeploy.pytorch.kernels.cuda import fused_moe, fused_moe_w8a8
1112
from lmdeploy.pytorch.kernels.cuda.blocked_fp8_fused_moe import fused_moe_blocked_fp8
@@ -475,6 +476,12 @@ def __init__(self,
475476
self.use_deep_gemm = False
476477
logger.warning('For higher performance, please install DeepGEMM https://github.com/deepseek-ai/DeepGEMM')
477478

479+
try:
480+
from dlblas.layers.moe.token_dispatcher import DeepEPBuffer, DeepEPMode, use_deepep # noqa: F401
481+
moe_backend.set_deepep_moe_backend()
482+
except ImportError:
483+
logger.warning('For higher performance, please install DeepEP https://github.com/deepseek-ai/DeepEP')
484+
478485
# pre-allocate buffer
479486
self.fusedmoe_build(True)
480487

lmdeploy/pytorch/backends/cuda/token_dispatcher.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
# Copyright (c) OpenMMLab. All rights reserved.
22
try:
33
from deep_ep import Buffer
4+
5+
from lmdeploy.pytorch.envs import deep_ep_buffer_num_sms
6+
7+
Buffer.set_num_sms(deep_ep_buffer_num_sms)
48
use_deepep = True
59
except ImportError:
610
use_deepep = False
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright (c) OpenMMLab. All rights reserved.
2+
class MoEBackend:
3+
4+
def __init__(self):
5+
"""Initialize moe backend."""
6+
self._use_deepep_moe_backend = False
7+
8+
def set_deepep_moe_backend(self):
9+
"""Set deepep moe backend."""
10+
self._use_deepep_moe_backend = True
11+
12+
def use_deepep_moe_backend(self):
13+
"""Get deepep moe backend."""
14+
return self._use_deepep_moe_backend
15+
16+
17+
MOE_BACKEND = None
18+
19+
20+
def get_moe_backend():
21+
global MOE_BACKEND
22+
if MOE_BACKEND is None:
23+
MOE_BACKEND = MoEBackend()
24+
25+
return MOE_BACKEND
26+
27+
28+
moe_backend = get_moe_backend()

lmdeploy/pytorch/envs.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,9 @@ def _patched_get_env(
127127
# If Ray is launched from outside, it may fail to access the environment variables.
128128
os.getenv('DEEPEP_MAX_BATCH_SIZE', None)
129129

130+
# deepep
131+
deep_ep_buffer_num_sms = env_to_int('DEEPEP_BUFFER_NUM_SMS', 20)
132+
130133
# deepgemm
131134
os.getenv('DG_JIT_DEBUG', '0')
132135
os.getenv('DG_JIT_PRINT_COMPILER_COMMAND', '0')

0 commit comments

Comments (0)