Disable prefix caching when serving a VLM model #4498
Workflow file for this run
name: pr_ete_test

on:
  pull_request:
    paths:
      - ".github/workflows/pr_ete_test.yml"
      - "cmake/**"
      - "src/**"
      - "autotest/**"
      - "3rdparty/**"
      - "lmdeploy/**"
      - "requirements/**"
      - "requirements_cuda.txt"
      - "CMakeLists.txt"
      - "setup.py"
  workflow_dispatch:
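# Cancel any in-progress run for the same workflow and PR/ref when a newer run starts.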
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
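# Workflow-level environment; PYTHONPATH makes the pre-staged offline LLaVA package importable during the tests.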
env:
  HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
  HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
  PYTHONPATH: /nvme/qa_test_models/offline_pkg/LLaVA

jobs:
  pr_functions_test:
    runs-on: [self-hosted, linux-a100-pr]
    timeout-minutes: 120
    env:
      REPORT_DIR: /nvme/qa_test_models/test-reports
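    # --pull never relies on the CUDA image already being cached on the self-hosted runner;
    # host volumes provide the pip cache, offline packages, and test models.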
    container:
      image: nvidia/cuda:12.4.1-devel-ubuntu22.04
      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip --pull never"
      volumes:
        - /nvme/share_data/github-actions/pip-cache:/root/.cache/pip
        - /nvme/share_data/github-actions/packages:/root/packages
        - /nvme/qa_test_models:/nvme/qa_test_models
        - /mnt/187:/mnt/187
        - /mnt/shared:/mnt/shared
        - /mnt/bigdisk:/mnt/bigdisk
        - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
    steps:
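      # Install build tools, Rust (via rustup), and Python 3.10 from the deadsnakes PPA,
      # then create a virtualenv at /opt/py3 and put it on PATH for the following steps.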
      - name: Setup systems
        run: |
          apt-get update -y && apt-get install -y software-properties-common wget vim git curl &&\
          curl https://sh.rustup.rs -sSf | sh -s -- -y &&\
          add-apt-repository ppa:deadsnakes/ppa -y && apt-get update -y && apt-get install -y --no-install-recommends \
              ninja-build rapidjson-dev libgoogle-glog-dev gdb python3.10 python3.10-dev python3.10-venv \
              && apt-get clean -y && rm -rf /var/lib/apt/lists/* && cd /opt && python3 -m venv py3
          echo "PATH=/opt/py3/bin:$PATH" >> "$GITHUB_ENV"
      - name: Clone repository
        uses: actions/checkout@v2
      - name: Install pytorch
        run: |
          python3 -m pip cache dir
          python3 -m pip install --upgrade pip setuptools==69.5.1
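      # Build OpenMPI from a pre-staged tarball, install pinned test dependencies,
      # install lmdeploy in editable mode, and use a pre-built flash-attention wheel
      # rather than compiling it from source.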
      - name: Build lmdeploy
        run: |
          cp /nvme/qa_test_models/offline_pkg/openmpi-4.1.5.tar.gz .
          tar xf openmpi-4.1.5.tar.gz && cd openmpi-4.1.5 && ./configure --prefix=/usr/local/openmpi
          make -j$(nproc) && make install && cd .. && rm -rf openmpi-4.1.5*
          export PATH=$PATH:/usr/local/openmpi/bin
          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openmpi/lib
          # We need to pin the transformers version (<4.52.0) to avoid test failures due to breaking changes.
          python3 -m pip install transformers==4.51.3 timm
          python3 -m pip install -r requirements/lite.txt
          python3 -m pip install -r requirements/test.txt
          python3 -m pip install -e .
          # The pre-built wheel is from https://github.com/Dao-AILab/flash-attention/releases
          python3 -m pip install /nvme/qa_test_models/offline_pkg/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
      - name: Check env
        run: |
          python3 -m pip list
          lmdeploy check_env
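      # Run PR-scoped tests on GPUs 5 and 6: two-GPU cases first (serially),
      # then single-GPU cases in parallel with two pytest-xdist workers (-n 2).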
      - name: Test lmdeploy
        run: |
          CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m 'pr_test and gpu_num_2' -x --alluredir=allure-results --clean-alluredir
          CUDA_VISIBLE_DEVICES=5,6 pytest autotest -m 'pr_test and gpu_num_1' -n 2 -x --alluredir=allure-results
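      # Always archive the allure results to a timestamped directory under REPORT_DIR,
      # even if the test step failed.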
      - name: Generate reports
        if: always()
        run: |
          export date_today="$(date +'%Y%m%d-%H%M%S')"
          export report_dir="$REPORT_DIR/$date_today"
          echo "Save report to $report_dir"
          mv allure-results $report_dir
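      # Wipe and recreate the workspace with open permissions so the next run
      # on this self-hosted runner starts from a clean checkout.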
      - name: Clear workfile
        if: always()
        run: |
          export workdir=$(pwd)
          cd ..
          rm -rf $workdir
          mkdir $workdir
          chmod -R 777 $workdir