
Commit 0662cfa

Add ROCm support: installation guide and FlashAttention compatibility for AMD GPUs (#3925)
* add runtime for rocm
* add requirements for rocm
* edit block size for rocm
* edit requirements
* add *.txt
* rm index-url
* add ROCm-safe tiling to avoid LDS OOR
* treat ROCm as bf16-supported
* add installation process for rocm
* use official lmdeploy URL and simplify ROCm installation guide with one-liner command
* style: apply pre-commit fixes in flashattention.py
* chore: apply pre-commit auto fixes (trailing whitespace, eof, quotes, line endings)
* remove pynvml
1 parent 11b9726 commit 0662cfa

File tree

docs/en/get_started/installation.md
docs/zh_cn/get_started/installation.md
lmdeploy/pytorch/kernels/cuda/flashattention.py
lmdeploy/utils.py
requirements/runtime_rocm.txt
requirements_rocm.txt

6 files changed: +91 −10 lines

docs/en/get_started/installation.md

Lines changed: 21 additions & 0 deletions
@@ -55,3 +55,24 @@ pip install https://github.com/InternLM/lmdeploy/archive/refs/tags/v0.10.0.zip
 ```
 
 If you want to build LMDeploy with support for Ascend, Cambricon, or MACA, install LMDeploy with the corresponding `LMDEPLOY_TARGET_DEVICE` environment variable.
+
+LMDeploy also supports installation on AMD GPUs with ROCm.
+
+```shell
+# The recommended way is to use the official ROCm PyTorch Docker image with pre-installed dependencies:
+docker run -it \
+    --cap-add=SYS_PTRACE \
+    --security-opt seccomp=unconfined \
+    --device=/dev/kfd \
+    --device=/dev/dri \
+    --group-add video \
+    --ipc=host \
+    --network=host \
+    --shm-size 32G \
+    -v /root:/workspace \
+    rocm/pytorch:latest
+
+
+# Once inside the container, install LMDeploy with ROCm support:
+LMDEPLOY_TARGET_DEVICE=rocm pip install git+https://github.com/InternLM/lmdeploy.git
+```
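
After the one-liner completes, a quick check from Python confirms the ROCm build is active. This is a minimal sketch, assumed to run inside the container above; on ROCm builds of PyTorch, `torch.version.hip` is set and the `torch.cuda` namespace is backed by HIP:

```python
# Minimal post-install sanity check (a sketch; run inside the ROCm container).
import torch

# ROCm builds of PyTorch set torch.version.hip; it is None on CUDA builds.
assert getattr(torch.version, 'hip', None) is not None, 'not a ROCm build of PyTorch'
print('HIP version:', torch.version.hip)

# On ROCm, the torch.cuda API is backed by HIP, so AMD GPUs are counted here.
print('visible GPUs:', torch.cuda.device_count())

import lmdeploy
print('lmdeploy version:', lmdeploy.__version__)
```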

docs/zh_cn/get_started/installation.md

Lines changed: 21 additions & 0 deletions
@@ -55,3 +55,24 @@ pip install https://github.com/InternLM/lmdeploy/archive/refs/tags/v0.10.0.zip
 ```
 
 If you want to build LMDeploy with support for Ascend, Cambricon, or MACA, install it with the corresponding `LMDEPLOY_TARGET_DEVICE` environment variable.
+
+LMDeploy also supports installation on AMD GPUs with ROCm.
+
+```shell
+# The recommended way is to use the official ROCm PyTorch Docker image with pre-installed dependencies:
+docker run -it \
+    --cap-add=SYS_PTRACE \
+    --security-opt seccomp=unconfined \
+    --device=/dev/kfd \
+    --device=/dev/dri \
+    --group-add video \
+    --ipc=host \
+    --network=host \
+    --shm-size 32G \
+    -v /root:/workspace \
+    rocm/pytorch:latest
+
+
+# Once inside the container, install LMDeploy with ROCm support:
+LMDEPLOY_TARGET_DEVICE=rocm pip install git+https://github.com/InternLM/lmdeploy.git
+```

lmdeploy/pytorch/kernels/cuda/flashattention.py

Lines changed: 22 additions & 10 deletions
@@ -425,6 +425,14 @@ def _kernel_meta_sm12x(BLOCK_DK: int, shared_kv: bool):
     return BLOCK_M, BLOCK_N, num_warps, num_stages
 
 
+def _kernel_meta_rocm(BLOCK_DK: int, shared_kv: bool):
+    BLOCK_N = 32
+    BLOCK_M = 32 if BLOCK_DK > 128 else 64
+    num_warps = 4
+    num_stages = 1
+    return BLOCK_M, BLOCK_N, num_warps, num_stages
+
+
 def flash_attention_fwd(
     q_states: Tensor,
     k_states: Tensor,
@@ -491,17 +499,21 @@ def grid(args):
     shared_kv = k_states.data_ptr() == v_states.data_ptr() and BLOCK_DK == BLOCK_DV
 
     num_warps = 4
-    if _nv_cap[0] < 8:
-        BLOCK_M, BLOCK_N, num_warps, num_stages = _kernel_meta_sm7x(BLOCK_DK)
-    elif _nv_cap[0] < 9:
-        if _nv_cap[1] in [6, 9]:
-            BLOCK_M, BLOCK_N, num_warps, num_stages = _kernel_meta_sm86(BLOCK_DK, shared_kv)
-        else:
-            BLOCK_M, BLOCK_N, num_warps, num_stages = _kernel_meta_sm8x(BLOCK_DK, shared_kv)
-    elif _nv_cap[0] < 10:
-        BLOCK_M, BLOCK_N, num_warps, num_stages = _kernel_meta_sm9x(BLOCK_DK, shared_kv)
+    hip_mode = getattr(torch.version, 'hip', None) is not None
+    if hip_mode:
+        BLOCK_M, BLOCK_N, num_warps, num_stages = _kernel_meta_rocm(BLOCK_DK, shared_kv)
     else:
-        BLOCK_M, BLOCK_N, num_warps, num_stages = _kernel_meta_sm12x(BLOCK_DK, shared_kv)
+        if _nv_cap[0] < 8:
+            BLOCK_M, BLOCK_N, num_warps, num_stages = _kernel_meta_sm7x(BLOCK_DK)
+        elif _nv_cap[0] < 9:
+            if _nv_cap[1] in [6, 9]:
+                BLOCK_M, BLOCK_N, num_warps, num_stages = _kernel_meta_sm86(BLOCK_DK, shared_kv)
+            else:
+                BLOCK_M, BLOCK_N, num_warps, num_stages = _kernel_meta_sm8x(BLOCK_DK, shared_kv)
+        elif _nv_cap[0] < 10:
+            BLOCK_M, BLOCK_N, num_warps, num_stages = _kernel_meta_sm9x(BLOCK_DK, shared_kv)
+        else:
+            BLOCK_M, BLOCK_N, num_warps, num_stages = _kernel_meta_sm12x(BLOCK_DK, shared_kv)
 
     BLOCK_M = min(128, BLOCK_M)
     _flash_prefill_fwd_kernel[grid](
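
The ROCm tile shapes are intentionally small: per the commit message, the tighter tiling avoids LDS (shared local memory) out-of-range failures on AMD GPUs. A rough back-of-envelope estimate shows why; this is an illustration only, assuming one fp16 Q, K, and V tile resident in LDS at once, not Triton's actual allocation, which also depends on `num_stages` and compiler scheduling:

```python
# Rough LDS estimate per workgroup, assuming one fp16 Q, K and V tile
# resident at once (illustrative only, not Triton's real allocation).
def approx_lds_bytes(block_m, block_n, block_dk, block_dv, elem_size=2):
    q_tile = block_m * block_dk  # query tile
    k_tile = block_n * block_dk  # key tile
    v_tile = block_n * block_dv  # value tile
    return (q_tile + k_tile + v_tile) * elem_size

# ROCm meta for BLOCK_DK > 128 uses BLOCK_M = BLOCK_N = 32:
print(approx_lds_bytes(32, 32, 256, 256) // 1024, 'KiB')  # 48 KiB, within a 64 KiB LDS
# 64x64 tiles at the same head dim would already exceed a 64 KiB budget:
print(approx_lds_bytes(64, 64, 256, 256) // 1024, 'KiB')  # 96 KiB
```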

lmdeploy/utils.py

Lines changed: 2 additions & 0 deletions
@@ -389,6 +389,8 @@ def is_bf16_supported(device_type: str = 'cuda'):
         return True
     elif device_type == 'camb':
         return True
+    elif device_type == 'rocm':
+        return True
     else:
         return False
 
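With this branch in place, `is_bf16_supported('rocm')` returns True, so dtype selection can keep bf16 on AMD GPUs instead of falling back to fp16. A small usage sketch, assuming only that the helper is importable from `lmdeploy.utils` as shown in the diff:

```python
from lmdeploy.utils import is_bf16_supported

# Before this commit the helper fell through to False for 'rocm',
# which forced an fp16 fallback; now bf16 is selected.
dtype = 'bfloat16' if is_bf16_supported('rocm') else 'float16'
print(dtype)  # bfloat16
```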

requirements/runtime_rocm.txt

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+accelerate>=0.29.3
+einops
+fastapi
+fire
+mmengine-lite
+numpy<2.0.0
+openai
+outlines
+partial_json_parser
+peft<=0.14.0
+pillow
+protobuf
+pydantic>2.0.0
+pyzmq
+ray
+safetensors
+sentencepiece
+shortuuid
+tiktoken
+transformers
+uvicorn

requirements_rocm.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+-r requirements/build.txt
+-r requirements/runtime_rocm.txt
+-r requirements/lite.txt
+-r requirements/serve.txt
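
The top-level requirements_rocm.txt mirrors the existing per-target aggregate files, so a build driven by `LMDEPLOY_TARGET_DEVICE=rocm` can resolve requirements by target name. A sketch of that selection logic follows; the helper is hypothetical and the real setup.py wiring may differ:

```python
import os

def parse_requirements(path):
    # Hypothetical helper (the real setup.py may differ): read a pip
    # requirements file, following '-r <file>' includes recursively.
    reqs = []
    base = os.path.dirname(path)
    with open(path) as f:
        for ln in f:
            ln = ln.strip()
            if not ln or ln.startswith('#'):
                continue
            if ln.startswith('-r '):
                reqs += parse_requirements(os.path.join(base, ln[3:].strip()))
            else:
                reqs.append(ln)
    return reqs

target = os.getenv('LMDEPLOY_TARGET_DEVICE', 'cuda')
requires = parse_requirements(f'requirements_{target}.txt')  # requirements_rocm.txt
```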
