Commit 4226b74

fix(docker) rocm 6.3 based image (#8152)
1. Fix the run script to properly read GPU_DRIVER.
2. Clone and adjust the Dockerfile for ROCm-based builds.
3. Adjust docker-compose.yml to use the cloned Dockerfile.
2 parents 933fb22 + 1424b7c commit 4226b74

10 files changed: +1325 -360 lines changed

.github/workflows/typegen-checks.yml

Lines changed: 12 additions & 0 deletions

@@ -39,6 +39,18 @@ jobs:
       - name: checkout
         uses: actions/checkout@v4

+      - name: Free up more disk space on the runner
+        # https://github.com/actions/runner-images/issues/2840#issuecomment-1284059930
+        run: |
+          echo "----- Free space before cleanup"
+          df -h
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+          sudo swapoff /mnt/swapfile
+          sudo rm -rf /mnt/swapfile
+          echo "----- Free space after cleanup"
+          df -h
+
       - name: check for changed files
         if: ${{ inputs.always_run != true }}
         id: changed-files

docker/.env.sample

Lines changed: 4 additions & 0 deletions

@@ -22,6 +22,10 @@
 ## GPU_DRIVER can be set to either `cuda` or `rocm` to enable GPU support in the container accordingly.
 # GPU_DRIVER=cuda #| rocm

+## If you are using ROCM, you will need to ensure that the render group within the container and the host system use the same group ID.
+## To obtain the group ID of the render group on the host system, run `getent group render` and grab the number.
+# RENDER_GROUP_ID=
+
 ## CONTAINER_UID can be set to the UID of the user on the host system that should own the files in the container.
 ## It is usually not necessary to change this. Use `id -u` on the host system to find the UID.
 # CONTAINER_UID=1000
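A quick way to fill this in on the host (the GID 993 and the username shown are illustrative; yours will differ):

    $ getent group render
    render:x:993:youruser
    $ echo "RENDER_GROUP_ID=993" >> docker/.env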

docker/Dockerfile

Lines changed: 9 additions & 14 deletions

@@ -43,7 +43,6 @@ ENV \
     UV_MANAGED_PYTHON=1 \
     UV_LINK_MODE=copy \
     UV_PROJECT_ENVIRONMENT=/opt/venv \
-    UV_INDEX="https://download.pytorch.org/whl/cu124" \
     INVOKEAI_ROOT=/invokeai \
     INVOKEAI_HOST=0.0.0.0 \
     INVOKEAI_PORT=9090 \

@@ -74,19 +73,17 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=uv.lock,target=uv.lock \
     # this is just to get the package manager to recognize that the project exists, without making changes to the docker layer
     --mount=type=bind,source=invokeai/version,target=invokeai/version \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ] || [ "$GPU_DRIVER" = "cpu" ]; then UV_INDEX="https://download.pytorch.org/whl/cpu"; \
-    elif [ "$GPU_DRIVER" = "rocm" ]; then UV_INDEX="https://download.pytorch.org/whl/rocm6.2"; \
-    fi && \
-    uv sync --frozen
-
-# build patchmatch
-RUN cd /usr/lib/$(uname -p)-linux-gnu/pkgconfig/ && ln -sf opencv4.pc opencv.pc
-RUN python -c "from patchmatch import patch_match"
+    ulimit -n 30000 && \
+    uv sync --extra $GPU_DRIVER --frozen

 # Link amdgpu.ids for ROCm builds
 # contributed by https://github.com/Rubonnek
 RUN mkdir -p "/opt/amdgpu/share/libdrm" &&\
-    ln -s "/usr/share/libdrm/amdgpu.ids" "/opt/amdgpu/share/libdrm/amdgpu.ids"
+    ln -s "/usr/share/libdrm/amdgpu.ids" "/opt/amdgpu/share/libdrm/amdgpu.ids" && groupadd render
+
+# build patchmatch
+RUN cd /usr/lib/$(uname -p)-linux-gnu/pkgconfig/ && ln -sf opencv4.pc opencv.pc
+RUN python -c "from patchmatch import patch_match"

 RUN mkdir -p ${INVOKEAI_ROOT} && chown -R ${CONTAINER_UID}:${CONTAINER_GID} ${INVOKEAI_ROOT}

@@ -105,8 +102,6 @@ COPY invokeai ${INVOKEAI_SRC}/invokeai
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
     --mount=type=bind,source=uv.lock,target=uv.lock \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ] || [ "$GPU_DRIVER" = "cpu" ]; then UV_INDEX="https://download.pytorch.org/whl/cpu"; \
-    elif [ "$GPU_DRIVER" = "rocm" ]; then UV_INDEX="https://download.pytorch.org/whl/rocm6.2"; \
-    fi && \
-    uv pip install -e .
+    ulimit -n 30000 && \
+    uv pip install -e .[$GPU_DRIVER]

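With the UV_INDEX branching gone, backend selection now happens through the project extras defined in pyproject.toml (shown further down), so a manual build only needs the GPU_DRIVER build arg. A sketch (the tag is illustrative; run.sh normally supplies the build args from .env):

    docker build -f docker/Dockerfile --build-arg GPU_DRIVER=rocm -t invokeai:rocm .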

docker/Dockerfile-rocm-full

Lines changed: 136 additions & 0 deletions

@@ -0,0 +1,136 @@
+# syntax=docker/dockerfile:1.4
+
+#### Web UI ------------------------------------
+
+FROM docker.io/node:22-slim AS web-builder
+ENV PNPM_HOME="/pnpm"
+ENV PATH="$PNPM_HOME:$PATH"
+RUN corepack use [email protected]
+RUN corepack enable
+
+WORKDIR /build
+COPY invokeai/frontend/web/ ./
+RUN --mount=type=cache,target=/pnpm/store \
+    pnpm install --frozen-lockfile
+RUN npx vite build
+
+## Backend ---------------------------------------
+
+FROM library/ubuntu:24.04
+
+ARG DEBIAN_FRONTEND=noninteractive
+RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache
+RUN --mount=type=cache,target=/var/cache/apt \
+    --mount=type=cache,target=/var/lib/apt \
+    apt update && apt install -y --no-install-recommends \
+    ca-certificates \
+    git \
+    gosu \
+    libglib2.0-0 \
+    libgl1 \
+    libglx-mesa0 \
+    build-essential \
+    libopencv-dev \
+    libstdc++-10-dev \
+    wget
+
+ENV \
+    PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    VIRTUAL_ENV=/opt/venv \
+    INVOKEAI_SRC=/opt/invokeai \
+    PYTHON_VERSION=3.12 \
+    UV_PYTHON=3.12 \
+    UV_COMPILE_BYTECODE=1 \
+    UV_MANAGED_PYTHON=1 \
+    UV_LINK_MODE=copy \
+    UV_PROJECT_ENVIRONMENT=/opt/venv \
+    INVOKEAI_ROOT=/invokeai \
+    INVOKEAI_HOST=0.0.0.0 \
+    INVOKEAI_PORT=9090 \
+    PATH="/opt/venv/bin:$PATH" \
+    CONTAINER_UID=${CONTAINER_UID:-1000} \
+    CONTAINER_GID=${CONTAINER_GID:-1000}
+
+ARG GPU_DRIVER=cuda
+
+# Install `uv` for package management
+COPY --from=ghcr.io/astral-sh/uv:0.6.9 /uv /uvx /bin/
+
+# Install python & allow non-root user to use it by traversing the /root dir without read permissions
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv python install ${PYTHON_VERSION} && \
+    # chmod --recursive a+rX /root/.local/share/uv/python
+    chmod 711 /root
+
+WORKDIR ${INVOKEAI_SRC}
+
+# Install project's dependencies as a separate layer so they aren't rebuilt every commit.
+# bind-mount instead of copy to defer adding sources to the image until next layer.
+#
+# NOTE: there are no pytorch builds for arm64 + cuda, only cpu
+# x86_64/CUDA is the default
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+    --mount=type=bind,source=uv.lock,target=uv.lock \
+    # this is just to get the package manager to recognize that the project exists, without making changes to the docker layer
+    --mount=type=bind,source=invokeai/version,target=invokeai/version \
+    ulimit -n 30000 && \
+    uv sync --extra $GPU_DRIVER --frozen
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    --mount=type=cache,target=/var/lib/apt \
+    if [ "$GPU_DRIVER" = "rocm" ]; then \
+    wget -O /tmp/amdgpu-install.deb \
+    https://repo.radeon.com/amdgpu-install/6.3.4/ubuntu/noble/amdgpu-install_6.3.60304-1_all.deb && \
+    apt install -y /tmp/amdgpu-install.deb && \
+    apt update && \
+    amdgpu-install --usecase=rocm -y && \
+    apt-get autoclean && \
+    apt clean && \
+    rm -rf /tmp/* /var/tmp/* && \
+    usermod -a -G render ubuntu && \
+    usermod -a -G video ubuntu && \
+    echo "\\n/opt/rocm/lib\\n/opt/rocm/lib64" >> /etc/ld.so.conf.d/rocm.conf && \
+    ldconfig && \
+    update-alternatives --auto rocm; \
+    fi
+
+## Heathen711: Leaving this for review input, will remove before merge
+# RUN --mount=type=cache,target=/var/cache/apt \
+#     --mount=type=cache,target=/var/lib/apt \
+#     if [ "$GPU_DRIVER" = "rocm" ]; then \
+#     groupadd render && \
+#     usermod -a -G render ubuntu && \
+#     usermod -a -G video ubuntu; \
+#     fi
+
+## Link amdgpu.ids for ROCm builds
+## contributed by https://github.com/Rubonnek
+# RUN mkdir -p "/opt/amdgpu/share/libdrm" &&\
+#     ln -s "/usr/share/libdrm/amdgpu.ids" "/opt/amdgpu/share/libdrm/amdgpu.ids"
+
+# build patchmatch
+RUN cd /usr/lib/$(uname -p)-linux-gnu/pkgconfig/ && ln -sf opencv4.pc opencv.pc
+RUN python -c "from patchmatch import patch_match"
+
+RUN mkdir -p ${INVOKEAI_ROOT} && chown -R ${CONTAINER_UID}:${CONTAINER_GID} ${INVOKEAI_ROOT}
+
+COPY docker/docker-entrypoint.sh ./
+ENTRYPOINT ["/opt/invokeai/docker-entrypoint.sh"]
+CMD ["invokeai-web"]
+
+# --link requires buldkit w/ dockerfile syntax 1.4, does not work with podman
+COPY --link --from=web-builder /build/dist ${INVOKEAI_SRC}/invokeai/frontend/web/dist
+
+# add sources last to minimize image changes on code changes
+COPY invokeai ${INVOKEAI_SRC}/invokeai
+
+# this should not increase image size because we've already installed dependencies
+# in a previous layer
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+    --mount=type=bind,source=uv.lock,target=uv.lock \
+    ulimit -n 30000 && \
+    uv pip install -e .[$GPU_DRIVER]
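Building this image by hand could look like the sketch below (the tag is illustrative, and the build context must be the repo root so the COPY steps resolve); the rocm compose profile is the intended entry point:

    docker build -f docker/Dockerfile-rocm-full --build-arg GPU_DRIVER=rocm -t invokeai:rocm-full .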

docker/docker-compose.yml

Lines changed: 4 additions & 3 deletions

@@ -47,8 +47,9 @@ services:

   invokeai-rocm:
     <<: *invokeai
-    devices:
-      - /dev/kfd:/dev/kfd
-      - /dev/dri:/dev/dri
+    environment:
+      - AMD_VISIBLE_DEVICES=all
+      - RENDER_GROUP_ID=${RENDER_GROUP_ID}
+    runtime: amd
     profiles:
       - rocm
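Note that `runtime: amd` assumes an `amd` container runtime is registered with the host's Docker daemon (for example via AMD's container toolkit); it replaces the explicit `/dev/kfd` and `/dev/dri` device mappings, and `AMD_VISIBLE_DEVICES=all` then exposes all GPUs to the container. Starting the service directly:

    docker compose --profile rocm up -d invokeai-rocm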

docker/docker-entrypoint.sh

Lines changed: 11 additions & 0 deletions

@@ -21,6 +21,17 @@ _=$(id ${USER} 2>&1) || useradd -u ${USER_ID} ${USER}
 # ensure the UID is correct
 usermod -u ${USER_ID} ${USER} 1>/dev/null

+## ROCM specific configuration
+# render group within the container must match the host render group
+# otherwise the container will not be able to access the host GPU.
+if [[ -v "RENDER_GROUP_ID" ]] && [[ ! -z "${RENDER_GROUP_ID}" ]]; then
+    # ensure the render group exists
+    groupmod -g ${RENDER_GROUP_ID} render
+    usermod -a -G render ${USER}
+    usermod -a -G video ${USER}
+fi
+
+
 ### Set the $PUBLIC_KEY env var to enable SSH access.
 # We do not install openssh-server in the image by default to avoid bloat.
 # but it is useful to have the full SSH server e.g. on Runpod.
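Since the entrypoint only remaps the group when `RENDER_GROUP_ID` is non-empty, one way to pass it through at launch instead of setting it in `.env` is a sketch like this (assumes the host actually has a `render` group):

    RENDER_GROUP_ID="$(getent group render | cut -d: -f3)" \
        docker compose --profile rocm up -d invokeai-rocm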

docker/run.sh

Lines changed: 2 additions & 2 deletions

@@ -13,7 +13,7 @@ run() {

   # parse .env file for build args
   build_args=$(awk '$1 ~ /=[^$]/ && $0 !~ /^#/ {print "--build-arg " $0 " "}' .env) &&
-  profile="$(awk -F '=' '/GPU_DRIVER/ {print $2}' .env)"
+  profile="$(awk -F '=' '/GPU_DRIVER=/ {print $2}' .env)"

   # default to 'cuda' profile
   [[ -z "$profile" ]] && profile="cuda"

@@ -30,7 +30,7 @@ run() {

   printf "%s\n" "starting service $service_name"
   docker compose --profile "$profile" up -d "$service_name"
-  docker compose logs -f
+  docker compose --profile "$profile" logs -f
 }

 run
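The awk tweak matters because the documentation comment in .env ("## GPU_DRIVER can be set to either ...") contains no `=` but still matched the old `/GPU_DRIVER/` pattern, so `profile` picked up a leading newline and never matched a valid profile name. A quick illustration against a hypothetical .env:

    $ printf '## GPU_DRIVER can be set to cuda or rocm\nGPU_DRIVER=rocm\n' > /tmp/env-demo
    $ awk -F '=' '/GPU_DRIVER/ {print $2}' /tmp/env-demo   # old pattern: empty record, then the value

    rocm
    $ awk -F '=' '/GPU_DRIVER=/ {print $2}' /tmp/env-demo  # fixed pattern: value only
    rocm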

docs/installation/manual.md

Lines changed: 16 additions & 16 deletions

@@ -69,34 +69,34 @@ The following commands vary depending on the version of Invoke being installed a
     - If you have an Nvidia 20xx series GPU or older, use `invokeai[xformers]`.
     - If you have an Nvidia 30xx series GPU or newer, or do not have an Nvidia GPU, use `invokeai`.

-7. Determine the `PyPI` index URL to use for installation, if any. This is necessary to get the right version of torch installed.
+7. Determine the torch backend to use for installation, if any. This is necessary to get the right version of torch installed. This is acheived by using [UV's built in torch support.](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection)

     === "Invoke v5.12 and later"

-        - If you are on Windows or Linux with an Nvidia GPU, use `https://download.pytorch.org/whl/cu128`.
-        - If you are on Linux with no GPU, use `https://download.pytorch.org/whl/cpu`.
-        - If you are on Linux with an AMD GPU, use `https://download.pytorch.org/whl/rocm6.2.4`.
-        - **In all other cases, do not use an index.**
+        - If you are on Windows or Linux with an Nvidia GPU, use `--torch-backend=cu128`.
+        - If you are on Linux with no GPU, use `--torch-backend=cpu`.
+        - If you are on Linux with an AMD GPU, use `--torch-backend=rocm6.3`.
+        - **In all other cases, do not use a torch backend.**

     === "Invoke v5.10.0 to v5.11.0"

-        - If you are on Windows or Linux with an Nvidia GPU, use `https://download.pytorch.org/whl/cu126`.
-        - If you are on Linux with no GPU, use `https://download.pytorch.org/whl/cpu`.
-        - If you are on Linux with an AMD GPU, use `https://download.pytorch.org/whl/rocm6.2.4`.
+        - If you are on Windows or Linux with an Nvidia GPU, use `--torch-backend=cu126`.
+        - If you are on Linux with no GPU, use `--torch-backend=cpu`.
+        - If you are on Linux with an AMD GPU, use `--torch-backend=rocm6.2.4`.
         - **In all other cases, do not use an index.**

     === "Invoke v5.0.0 to v5.9.1"

-        - If you are on Windows with an Nvidia GPU, use `https://download.pytorch.org/whl/cu124`.
-        - If you are on Linux with no GPU, use `https://download.pytorch.org/whl/cpu`.
-        - If you are on Linux with an AMD GPU, use `https://download.pytorch.org/whl/rocm6.1`.
+        - If you are on Windows with an Nvidia GPU, use `--torch-backend=cu124`.
+        - If you are on Linux with no GPU, use `--torch-backend=cpu`.
+        - If you are on Linux with an AMD GPU, use `--torch-backend=rocm6.1`.
         - **In all other cases, do not use an index.**

     === "Invoke v4"

-        - If you are on Windows with an Nvidia GPU, use `https://download.pytorch.org/whl/cu124`.
-        - If you are on Linux with no GPU, use `https://download.pytorch.org/whl/cpu`.
-        - If you are on Linux with an AMD GPU, use `https://download.pytorch.org/whl/rocm5.2`.
+        - If you are on Windows with an Nvidia GPU, use `--torch-backend=cu124`.
+        - If you are on Linux with no GPU, use `--torch-backend=cpu`.
+        - If you are on Linux with an AMD GPU, use `--torch-backend=rocm5.2`.
         - **In all other cases, do not use an index.**

 8. Install the `invokeai` package. Substitute the package specifier and version.

@@ -105,10 +105,10 @@ The following commands vary depending on the version of Invoke being installed a
     uv pip install <PACKAGE_SPECIFIER>==<VERSION> --python 3.12 --python-preference only-managed --force-reinstall
     ```

-    If you determined you needed to use a `PyPI` index URL in the previous step, you'll need to add `--index=<INDEX_URL>` like this:
+    If you determined you needed to use a torch backend in the previous step, you'll need to set the backend like this:

     ```sh
-    uv pip install <PACKAGE_SPECIFIER>==<VERSION> --python 3.12 --python-preference only-managed --index=<INDEX_URL> --force-reinstall
+    uv pip install <PACKAGE_SPECIFIER>==<VERSION> --python 3.12 --python-preference only-managed --torch-backend=<VERSION> --force-reinstall
     ```

 9. Deactivate and reactivate your venv so that the invokeai-specific commands become available in the environment:
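Putting steps 7 and 8 together, a Linux install against an AMD GPU on a current release would look roughly like this (substitute the real version):

    uv pip install invokeai==<VERSION> --python 3.12 --python-preference only-managed --torch-backend=rocm6.3 --force-reinstall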

pyproject.toml

Lines changed: 40 additions & 1 deletion

@@ -73,7 +73,7 @@ dependencies = [
     "pypatchmatch",
     "python-multipart",
     "requests",
-    "semver~=3.0.1"
+    "semver~=3.0.1",
 ]

 [project.optional-dependencies]

@@ -83,6 +83,14 @@ dependencies = [
     # torch 2.4+cu carries its own triton dependency
 ]

+"cpu" = ["torch==2.7.1+cpu", "torchvision==0.22.1+cpu"]
+"cuda" = ["torch==2.7.1+cu128", "torchvision==0.22.1+cu128"]
+"rocm" = [
+    "torch==2.7.1+rocm6.3",
+    "torchvision==0.22.1+rocm6.3",
+    "pytorch-triton-rocm",
+]
+
 "onnx" = ["onnxruntime"]
 "onnx-cuda" = ["onnxruntime-gpu"]
 "onnx-directml" = ["onnxruntime-directml"]

@@ -113,7 +121,38 @@
 # Prevent opencv-python from ever being chosen during dependency resolution.
 # This prevents conflicts with opencv-contrib-python, which Invoke requires.
 override-dependencies = ["opencv-python; sys_platform=='never'"]
+conflicts = [[{ extra = "cpu" }, { extra = "cuda" }, { extra = "rocm" }]]
+index-strategy = "unsafe-best-match"
+
+[tool.uv.sources]
+torch = [
+    { index = "torch-cpu", extra = "cpu" },
+    { index = "torch-cuda", extra = "cuda" },
+    { index = "torch-rocm", extra = "rocm" },
+]
+torchvision = [
+    { index = "torch-cpu", extra = "cpu" },
+    { index = "torch-cuda", extra = "cuda" },
+    { index = "torch-rocm", extra = "rocm" },
+]
+pytorch-triton-rocm = [
+    { index = "torch-rocm", extra = "rocm", marker = "sys_platform == 'linux'" },
+]
+
+[[tool.uv.index]]
+name = "torch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
+
+[[tool.uv.index]]
+name = "torch-cuda"
+url = "https://download.pytorch.org/whl/cu128"
+explicit = true

+[[tool.uv.index]]
+name = "torch-rocm"
+url = "https://download.pytorch.org/whl/rocm6.3"
+explicit = true

 [project.scripts]
 "invokeai-web" = "invokeai.app.run_app:run_app"
