Commit 4226b74

fix(docker) rocm 6.3 based image (#8152)
1. Fix the run script to properly read GPU_DRIVER.
2. Clone and adjust the Dockerfile for ROCm-based builds.
3. Adjust docker-compose.yml to use the cloned Dockerfile.
2 parents 933fb22 + 1424b7c commit 4226b74

10 files changed: +1325 -360 lines changed

.github/workflows/typegen-checks.yml

Lines changed: 12 additions & 0 deletions

@@ -39,6 +39,18 @@ jobs:
       - name: checkout
         uses: actions/checkout@v4

+      - name: Free up more disk space on the runner
+        # https://github.com/actions/runner-images/issues/2840#issuecomment-1284059930
+        run: |
+          echo "----- Free space before cleanup"
+          df -h
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+          sudo swapoff /mnt/swapfile
+          sudo rm -rf /mnt/swapfile
+          echo "----- Free space after cleanup"
+          df -h
+
       - name: check for changed files
         if: ${{ inputs.always_run != true }}
         id: changed-files

docker/.env.sample

Lines changed: 4 additions & 0 deletions

@@ -22,6 +22,10 @@
 ## GPU_DRIVER can be set to either `cuda` or `rocm` to enable GPU support in the container accordingly.
 # GPU_DRIVER=cuda #| rocm

+## If you are using ROCM, you will need to ensure that the render group within the container and the host system use the same group ID.
+## To obtain the group ID of the render group on the host system, run `getent group render` and grab the number.
+# RENDER_GROUP_ID=
+
 ## CONTAINER_UID can be set to the UID of the user on the host system that should own the files in the container.
 ## It is usually not necessary to change this. Use `id -u` on the host system to find the UID.
 # CONTAINER_UID=1000
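A quick way to fill this in on the host (the GID 993 and the username shown are illustrative; yours will differ):

    $ getent group render
    render:x:993:youruser
    $ echo "RENDER_GROUP_ID=993" >> docker/.env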

docker/Dockerfile

Lines changed: 9 additions & 14 deletions

@@ -43,7 +43,6 @@ ENV \
     UV_MANAGED_PYTHON=1 \
     UV_LINK_MODE=copy \
     UV_PROJECT_ENVIRONMENT=/opt/venv \
-    UV_INDEX="https://download.pytorch.org/whl/cu124" \
     INVOKEAI_ROOT=/invokeai \
     INVOKEAI_HOST=0.0.0.0 \
     INVOKEAI_PORT=9090 \

@@ -74,19 +73,17 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=uv.lock,target=uv.lock \
     # this is just to get the package manager to recognize that the project exists, without making changes to the docker layer
     --mount=type=bind,source=invokeai/version,target=invokeai/version \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ] || [ "$GPU_DRIVER" = "cpu" ]; then UV_INDEX="https://download.pytorch.org/whl/cpu"; \
-    elif [ "$GPU_DRIVER" = "rocm" ]; then UV_INDEX="https://download.pytorch.org/whl/rocm6.2"; \
-    fi && \
-    uv sync --frozen
-
-# build patchmatch
-RUN cd /usr/lib/$(uname -p)-linux-gnu/pkgconfig/ && ln -sf opencv4.pc opencv.pc
-RUN python -c "from patchmatch import patch_match"
+    ulimit -n 30000 && \
+    uv sync --extra $GPU_DRIVER --frozen

 # Link amdgpu.ids for ROCm builds
 # contributed by https://github.com/Rubonnek
 RUN mkdir -p "/opt/amdgpu/share/libdrm" &&\
-    ln -s "/usr/share/libdrm/amdgpu.ids" "/opt/amdgpu/share/libdrm/amdgpu.ids"
+    ln -s "/usr/share/libdrm/amdgpu.ids" "/opt/amdgpu/share/libdrm/amdgpu.ids" && groupadd render
+
+# build patchmatch
+RUN cd /usr/lib/$(uname -p)-linux-gnu/pkgconfig/ && ln -sf opencv4.pc opencv.pc
+RUN python -c "from patchmatch import patch_match"

 RUN mkdir -p ${INVOKEAI_ROOT} && chown -R ${CONTAINER_UID}:${CONTAINER_GID} ${INVOKEAI_ROOT}

@@ -105,8 +102,6 @@ COPY invokeai ${INVOKEAI_SRC}/invokeai
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
     --mount=type=bind,source=uv.lock,target=uv.lock \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ] || [ "$GPU_DRIVER" = "cpu" ]; then UV_INDEX="https://download.pytorch.org/whl/cpu"; \
-    elif [ "$GPU_DRIVER" = "rocm" ]; then UV_INDEX="https://download.pytorch.org/whl/rocm6.2"; \
-    fi && \
-    uv pip install -e .
+    ulimit -n 30000 && \
+    uv pip install -e .[$GPU_DRIVER]

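With the UV_INDEX branching gone, backend selection now happens through the project extras defined in pyproject.toml (shown further down), so a manual build only needs the GPU_DRIVER build arg. A sketch (the tag is illustrative; run.sh normally supplies the build args from .env):

    docker build -f docker/Dockerfile --build-arg GPU_DRIVER=rocm -t invokeai:rocm .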

docker/Dockerfile-rocm-full

Lines changed: 136 additions & 0 deletions

@@ -0,0 +1,136 @@
+# syntax=docker/dockerfile:1.4
+
+#### Web UI ------------------------------------
+
+FROM docker.io/node:22-slim AS web-builder
+ENV PNPM_HOME="/pnpm"
+ENV PATH="$PNPM_HOME:$PATH"
+RUN corepack use [email protected]
+RUN corepack enable
+
+WORKDIR /build
+COPY invokeai/frontend/web/ ./
+RUN --mount=type=cache,target=/pnpm/store \
+    pnpm install --frozen-lockfile
+RUN npx vite build
+
+## Backend ---------------------------------------
+
+FROM library/ubuntu:24.04
+
+ARG DEBIAN_FRONTEND=noninteractive
+RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache
+RUN --mount=type=cache,target=/var/cache/apt \
+    --mount=type=cache,target=/var/lib/apt \
+    apt update && apt install -y --no-install-recommends \
+    ca-certificates \
+    git \
+    gosu \
+    libglib2.0-0 \
+    libgl1 \
+    libglx-mesa0 \
+    build-essential \
+    libopencv-dev \
+    libstdc++-10-dev \
+    wget
+
+ENV \
+    PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    VIRTUAL_ENV=/opt/venv \
+    INVOKEAI_SRC=/opt/invokeai \
+    PYTHON_VERSION=3.12 \
+    UV_PYTHON=3.12 \
+    UV_COMPILE_BYTECODE=1 \
+    UV_MANAGED_PYTHON=1 \
+    UV_LINK_MODE=copy \
+    UV_PROJECT_ENVIRONMENT=/opt/venv \
+    INVOKEAI_ROOT=/invokeai \
+    INVOKEAI_HOST=0.0.0.0 \
+    INVOKEAI_PORT=9090 \
+    PATH="/opt/venv/bin:$PATH" \
+    CONTAINER_UID=${CONTAINER_UID:-1000} \
+    CONTAINER_GID=${CONTAINER_GID:-1000}
+
+ARG GPU_DRIVER=cuda
+
+# Install `uv` for package management
+COPY --from=ghcr.io/astral-sh/uv:0.6.9 /uv /uvx /bin/
+
+# Install python & allow non-root user to use it by traversing the /root dir without read permissions
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv python install ${PYTHON_VERSION} && \
+    # chmod --recursive a+rX /root/.local/share/uv/python
+    chmod 711 /root
+
+WORKDIR ${INVOKEAI_SRC}
+
+# Install project's dependencies as a separate layer so they aren't rebuilt every commit.
+# bind-mount instead of copy to defer adding sources to the image until next layer.
+#
+# NOTE: there are no pytorch builds for arm64 + cuda, only cpu
+# x86_64/CUDA is the default
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+    --mount=type=bind,source=uv.lock,target=uv.lock \
+    # this is just to get the package manager to recognize that the project exists, without making changes to the docker layer
+    --mount=type=bind,source=invokeai/version,target=invokeai/version \
+    ulimit -n 30000 && \
+    uv sync --extra $GPU_DRIVER --frozen
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    --mount=type=cache,target=/var/lib/apt \
+    if [ "$GPU_DRIVER" = "rocm" ]; then \
+    wget -O /tmp/amdgpu-install.deb \
+    https://repo.radeon.com/amdgpu-install/6.3.4/ubuntu/noble/amdgpu-install_6.3.60304-1_all.deb && \
+    apt install -y /tmp/amdgpu-install.deb && \
+    apt update && \
+    amdgpu-install --usecase=rocm -y && \
+    apt-get autoclean && \
+    apt clean && \
+    rm -rf /tmp/* /var/tmp/* && \
+    usermod -a -G render ubuntu && \
+    usermod -a -G video ubuntu && \
+    echo "\\n/opt/rocm/lib\\n/opt/rocm/lib64" >> /etc/ld.so.conf.d/rocm.conf && \
+    ldconfig && \
+    update-alternatives --auto rocm; \
+    fi
+
+## Heathen711: Leaving this for review input, will remove before merge
+# RUN --mount=type=cache,target=/var/cache/apt \
+#     --mount=type=cache,target=/var/lib/apt \
+#     if [ "$GPU_DRIVER" = "rocm" ]; then \
+#     groupadd render && \
+#     usermod -a -G render ubuntu && \
+#     usermod -a -G video ubuntu; \
+#     fi
+
+## Link amdgpu.ids for ROCm builds
+## contributed by https://github.com/Rubonnek
+# RUN mkdir -p "/opt/amdgpu/share/libdrm" &&\
+#     ln -s "/usr/share/libdrm/amdgpu.ids" "/opt/amdgpu/share/libdrm/amdgpu.ids"
+
+# build patchmatch
+RUN cd /usr/lib/$(uname -p)-linux-gnu/pkgconfig/ && ln -sf opencv4.pc opencv.pc
+RUN python -c "from patchmatch import patch_match"
+
+RUN mkdir -p ${INVOKEAI_ROOT} && chown -R ${CONTAINER_UID}:${CONTAINER_GID} ${INVOKEAI_ROOT}
+
+COPY docker/docker-entrypoint.sh ./
+ENTRYPOINT ["/opt/invokeai/docker-entrypoint.sh"]
+CMD ["invokeai-web"]
+
+# --link requires buldkit w/ dockerfile syntax 1.4, does not work with podman
+COPY --link --from=web-builder /build/dist ${INVOKEAI_SRC}/invokeai/frontend/web/dist
+
+# add sources last to minimize image changes on code changes
+COPY invokeai ${INVOKEAI_SRC}/invokeai
+
+# this should not increase image size because we've already installed dependencies
+# in a previous layer
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+    --mount=type=bind,source=uv.lock,target=uv.lock \
+    ulimit -n 30000 && \
+    uv pip install -e .[$GPU_DRIVER]
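Building this image by hand could look like the sketch below (the tag is illustrative, and the build context must be the repo root so the COPY steps resolve); the rocm compose profile is the intended entry point:

    docker build -f docker/Dockerfile-rocm-full --build-arg GPU_DRIVER=rocm -t invokeai:rocm-full .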

docker/docker-compose.yml

Lines changed: 4 additions & 3 deletions

@@ -47,8 +47,9 @@ services:

   invokeai-rocm:
     <<: *invokeai
-    devices:
-      - /dev/kfd:/dev/kfd
-      - /dev/dri:/dev/dri
+    environment:
+      - AMD_VISIBLE_DEVICES=all
+      - RENDER_GROUP_ID=${RENDER_GROUP_ID}
+    runtime: amd
     profiles:
       - rocm
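Note that `runtime: amd` assumes an `amd` container runtime is registered with the host's Docker daemon (for example via AMD's container toolkit); it replaces the explicit `/dev/kfd` and `/dev/dri` device mappings, and `AMD_VISIBLE_DEVICES=all` then exposes all GPUs to the container. Starting the service directly:

    docker compose --profile rocm up -d invokeai-rocm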

docker/docker-entrypoint.sh

Lines changed: 11 additions & 0 deletions

@@ -21,6 +21,17 @@ _=$(id ${USER} 2>&1) || useradd -u ${USER_ID} ${USER}
 # ensure the UID is correct
 usermod -u ${USER_ID} ${USER} 1>/dev/null

+## ROCM specific configuration
+# render group within the container must match the host render group
+# otherwise the container will not be able to access the host GPU.
+if [[ -v "RENDER_GROUP_ID" ]] && [[ ! -z "${RENDER_GROUP_ID}" ]]; then
+    # ensure the render group exists
+    groupmod -g ${RENDER_GROUP_ID} render
+    usermod -a -G render ${USER}
+    usermod -a -G video ${USER}
+fi
+
+
 ### Set the $PUBLIC_KEY env var to enable SSH access.
 # We do not install openssh-server in the image by default to avoid bloat.
 # but it is useful to have the full SSH server e.g. on Runpod.
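Since the entrypoint only remaps the group when `RENDER_GROUP_ID` is non-empty, one way to pass it through at launch instead of setting it in `.env` is a sketch like this (assumes the host actually has a `render` group):

    RENDER_GROUP_ID="$(getent group render | cut -d: -f3)" \
        docker compose --profile rocm up -d invokeai-rocm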

docker/run.sh

Lines changed: 2 additions & 2 deletions

@@ -13,7 +13,7 @@ run() {

   # parse .env file for build args
   build_args=$(awk '$1 ~ /=[^$]/ && $0 !~ /^#/ {print "--build-arg " $0 " "}' .env) &&
-  profile="$(awk -F '=' '/GPU_DRIVER/ {print $2}' .env)"
+  profile="$(awk -F '=' '/GPU_DRIVER=/ {print $2}' .env)"

   # default to 'cuda' profile
   [[ -z "$profile" ]] && profile="cuda"

@@ -30,7 +30,7 @@ run() {

   printf "%s\n" "starting service $service_name"
   docker compose --profile "$profile" up -d "$service_name"
-  docker compose logs -f
+  docker compose --profile "$profile" logs -f
 }

 run
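The awk tweak matters because the documentation comment in .env ("## GPU_DRIVER can be set to either ...") contains no `=` but still matched the old `/GPU_DRIVER/` pattern, so `profile` picked up a leading newline and never matched a valid profile name. A quick illustration against a hypothetical .env:

    $ printf '## GPU_DRIVER can be set to cuda or rocm\nGPU_DRIVER=rocm\n' > /tmp/env-demo
    $ awk -F '=' '/GPU_DRIVER/ {print $2}' /tmp/env-demo   # old pattern: empty record, then the value

    rocm
    $ awk -F '=' '/GPU_DRIVER=/ {print $2}' /tmp/env-demo  # fixed pattern: value only
    rocm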

docs/installation/manual.md

Lines changed: 16 additions & 16 deletions

@@ -69,34 +69,34 @@ The following commands vary depending on the version of Invoke being installed a
     - If you have an Nvidia 20xx series GPU or older, use `invokeai[xformers]`.
     - If you have an Nvidia 30xx series GPU or newer, or do not have an Nvidia GPU, use `invokeai`.

-7. Determine the `PyPI` index URL to use for installation, if any. This is necessary to get the right version of torch installed.
+7. Determine the torch backend to use for installation, if any. This is necessary to get the right version of torch installed. This is acheived by using [UV's built in torch support.](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection)

     === "Invoke v5.12 and later"

-        - If you are on Windows or Linux with an Nvidia GPU, use `https://download.pytorch.org/whl/cu128`.
-        - If you are on Linux with no GPU, use `https://download.pytorch.org/whl/cpu`.
-        - If you are on Linux with an AMD GPU, use `https://download.pytorch.org/whl/rocm6.2.4`.
-        - **In all other cases, do not use an index.**
+        - If you are on Windows or Linux with an Nvidia GPU, use `--torch-backend=cu128`.
+        - If you are on Linux with no GPU, use `--torch-backend=cpu`.
+        - If you are on Linux with an AMD GPU, use `--torch-backend=rocm6.3`.
+        - **In all other cases, do not use a torch backend.**

     === "Invoke v5.10.0 to v5.11.0"

-        - If you are on Windows or Linux with an Nvidia GPU, use `https://download.pytorch.org/whl/cu126`.
-        - If you are on Linux with no GPU, use `https://download.pytorch.org/whl/cpu`.
-        - If you are on Linux with an AMD GPU, use `https://download.pytorch.org/whl/rocm6.2.4`.
+        - If you are on Windows or Linux with an Nvidia GPU, use `--torch-backend=cu126`.
+        - If you are on Linux with no GPU, use `--torch-backend=cpu`.
+        - If you are on Linux with an AMD GPU, use `--torch-backend=rocm6.2.4`.
         - **In all other cases, do not use an index.**

     === "Invoke v5.0.0 to v5.9.1"

-        - If you are on Windows with an Nvidia GPU, use `https://download.pytorch.org/whl/cu124`.
-        - If you are on Linux with no GPU, use `https://download.pytorch.org/whl/cpu`.
-        - If you are on Linux with an AMD GPU, use `https://download.pytorch.org/whl/rocm6.1`.
+        - If you are on Windows with an Nvidia GPU, use `--torch-backend=cu124`.
+        - If you are on Linux with no GPU, use `--torch-backend=cpu`.
+        - If you are on Linux with an AMD GPU, use `--torch-backend=rocm6.1`.
         - **In all other cases, do not use an index.**

     === "Invoke v4"

-        - If you are on Windows with an Nvidia GPU, use `https://download.pytorch.org/whl/cu124`.
-        - If you are on Linux with no GPU, use `https://download.pytorch.org/whl/cpu`.
-        - If you are on Linux with an AMD GPU, use `https://download.pytorch.org/whl/rocm5.2`.
+        - If you are on Windows with an Nvidia GPU, use `--torch-backend=cu124`.
+        - If you are on Linux with no GPU, use `--torch-backend=cpu`.
+        - If you are on Linux with an AMD GPU, use `--torch-backend=rocm5.2`.
         - **In all other cases, do not use an index.**

 8. Install the `invokeai` package. Substitute the package specifier and version.

@@ -105,10 +105,10 @@ The following commands vary depending on the version of Invoke being installed a
     uv pip install <PACKAGE_SPECIFIER>==<VERSION> --python 3.12 --python-preference only-managed --force-reinstall
     ```

-    If you determined you needed to use a `PyPI` index URL in the previous step, you'll need to add `--index=<INDEX_URL>` like this:
+    If you determined you needed to use a torch backend in the previous step, you'll need to set the backend like this:

     ```sh
-    uv pip install <PACKAGE_SPECIFIER>==<VERSION> --python 3.12 --python-preference only-managed --index=<INDEX_URL> --force-reinstall
+    uv pip install <PACKAGE_SPECIFIER>==<VERSION> --python 3.12 --python-preference only-managed --torch-backend=<VERSION> --force-reinstall
     ```

 9. Deactivate and reactivate your venv so that the invokeai-specific commands become available in the environment:
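Putting steps 7 and 8 together, a Linux install against an AMD GPU on a current release would look roughly like this (substitute the real version):

    uv pip install invokeai==<VERSION> --python 3.12 --python-preference only-managed --torch-backend=rocm6.3 --force-reinstall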

pyproject.toml

Lines changed: 40 additions & 1 deletion

@@ -73,7 +73,7 @@ dependencies = [
     "pypatchmatch",
     "python-multipart",
     "requests",
-    "semver~=3.0.1"
+    "semver~=3.0.1",
 ]

 [project.optional-dependencies]

@@ -83,6 +83,14 @@ dependencies = [
     # torch 2.4+cu carries its own triton dependency
 ]

+"cpu" = ["torch==2.7.1+cpu", "torchvision==0.22.1+cpu"]
+"cuda" = ["torch==2.7.1+cu128", "torchvision==0.22.1+cu128"]
+"rocm" = [
+    "torch==2.7.1+rocm6.3",
+    "torchvision==0.22.1+rocm6.3",
+    "pytorch-triton-rocm",
+]
+
 "onnx" = ["onnxruntime"]
 "onnx-cuda" = ["onnxruntime-gpu"]
 "onnx-directml" = ["onnxruntime-directml"]

@@ -113,7 +121,38 @@
 # Prevent opencv-python from ever being chosen during dependency resolution.
 # This prevents conflicts with opencv-contrib-python, which Invoke requires.
 override-dependencies = ["opencv-python; sys_platform=='never'"]
+conflicts = [[{ extra = "cpu" }, { extra = "cuda" }, { extra = "rocm" }]]
+index-strategy = "unsafe-best-match"
+
+[tool.uv.sources]
+torch = [
+    { index = "torch-cpu", extra = "cpu" },
+    { index = "torch-cuda", extra = "cuda" },
+    { index = "torch-rocm", extra = "rocm" },
+]
+torchvision = [
+    { index = "torch-cpu", extra = "cpu" },
+    { index = "torch-cuda", extra = "cuda" },
+    { index = "torch-rocm", extra = "rocm" },
+]
+pytorch-triton-rocm = [
+    { index = "torch-rocm", extra = "rocm", marker = "sys_platform == 'linux'" },
+]
+
+[[tool.uv.index]]
+name = "torch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
+
+[[tool.uv.index]]
+name = "torch-cuda"
+url = "https://download.pytorch.org/whl/cu128"
+explicit = true

+[[tool.uv.index]]
+name = "torch-rocm"
+url = "https://download.pytorch.org/whl/rocm6.3"
+explicit = true

 [project.scripts]
 "invokeai-web" = "invokeai.app.run_app:run_app"
