@@ -10,12 +10,18 @@ steps:
1010 - image : badouralix/curl-jq
1111 command :
1212 - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
13-
# Added step: runs `docker system prune -a --volumes --force` on an agent taken from the H100 queue.
# `depends_on : ~` is a YAML null, so this step names no dependency (in particular, it does not
# name wait-for-container-image the way the benchmark steps do).
# NOTE(review): presumably this lets the cleanup start without waiting for the image; confirm
# against Buildkite's handling of a null depends_on.
13+ - label : " Cleanup H100"
14+ agents :
15+ queue : H100
16+ depends_on : ~
17+ command : docker system prune -a --volumes --force
18+
1419 - label : " A100"
1520 # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
1621 agents :
1722 queue : A100
1823 depends_on : wait-for-container-image
24+ if : build.branch == "main"
1925 plugins :
2026 - kubernetes :
2127 podSpec :
5056 agents :
5157 queue : H200
5258 depends_on : wait-for-container-image
59+ if : build.branch == "main"
5360 plugins :
5461 - docker#v5.12.0:
5562 image : public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@@ -70,20 +77,99 @@ steps:
7077 # key: block-h100
7178 # depends_on: ~
7279
# Diff hunk: the lines marked `-` drop the " Cleanup H100" step (label, depends_on, prune command)
# from this position; the lines marked `+` put an H100 benchmark step here instead. The
# `agents` / `queue : H100` lines are unchanged context shared by both versions.
73- - label : " Cleanup H100"
80+ - label : " H100"
81+ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
7482 agents :
7583 queue : H100
76- depends_on : ~
77- command : docker system prune -a --volumes --force
# Depends on `wait-for-container-image` (NOTE(review): presumably the key of the image-wait step;
# its definition is not visible in this hunk) and is gated on the build branch being "main".
84+ depends_on : wait-for-container-image
85+ if : build.branch == "main"
86+ plugins :
87+ - docker#v5.12.0:
# Image is pulled from the vllm-ci-postmerge-repo repository, tagged with $BUILDKITE_COMMIT.
88+ image : public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
# Command is given as a two-element list: `bash` plus the benchmark script path.
89+ command :
90+ - bash
91+ - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
92+ mount-buildkite-agent : true
93+ propagate-environment : true
94+ ipc : host
95+ gpus : all # see CUDA_VISIBLE_DEVICES for actual GPUs used
# Maps /data/benchmark-hf-cache to /root/.cache/huggingface.
# NOTE(review): host-path:container-path order assumed from the docker plugin convention.
96+ volumes :
97+ - /data/benchmark-hf-cache:/root/.cache/huggingface
# Variables are listed by name only, with no values.
# NOTE(review): presumably forwarded from the agent's environment; confirm.
98+ environment :
99+ - VLLM_USAGE_SOURCE
100+ - HF_TOKEN
101+
102+ # Premerge benchmark
# Added step: A100 benchmark for builds whose branch is NOT "main" (`if : build.branch != "main"`).
# It runs through the kubernetes plugin with an inline podSpec and uses the vllm-ci-test-repo
# image tagged with $BUILDKITE_COMMIT.
103+ - label : " A100"
104+ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
105+ agents :
106+ queue : A100
107+ depends_on : wait-for-container-image
108+ if : build.branch != "main"
109+ plugins :
110+ - kubernetes :
111+ podSpec :
112+ priorityClassName : perf-benchmark
113+ containers :
114+ - image : public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# Unlike the docker-plugin steps, the command here is a single list item holding
# `bash <script>` as one string.
# NOTE(review): relies on the kubernetes plugin's own command handling; confirm.
115+ command :
116+ - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
# Resource limit of 8 for nvidia.com/gpu.
117+ resources :
118+ limits :
119+ nvidia.com/gpu : 8
# Mounts the `devshm` volume (declared under `volumes` below) at /dev/shm.
120+ volumeMounts :
121+ - name : devshm
122+ mountPath : /dev/shm
# VLLM_USAGE_SOURCE is set to the literal `ci-test`; HF_TOKEN is read from key `token` of the
# secret named `hf-token-secret`.
123+ env :
124+ - name : VLLM_USAGE_SOURCE
125+ value : ci-test
126+ - name : HF_TOKEN
127+ valueFrom :
128+ secretKeyRef :
129+ name : hf-token-secret
130+ key : token
# Node selector on the GPU product label NVIDIA-A100-SXM4-80GB.
131+ nodeSelector :
132+ nvidia.com/gpu.product : NVIDIA-A100-SXM4-80GB
# `devshm` is an emptyDir with `medium : Memory`.
133+ volumes :
134+ - name : devshm
135+ emptyDir :
136+ medium : Memory
137+
# Added step: H200 benchmark for builds whose branch is NOT "main" (`if : build.branch != "main"`).
# It runs through docker plugin v5.12.0 and uses the vllm-ci-test-repo image tagged with
# $BUILDKITE_COMMIT.
138+ - label : " H200"
139+ # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
140+ agents :
141+ queue : H200
142+ depends_on : wait-for-container-image
143+ if : build.branch != "main"
144+ plugins :
145+ - docker#v5.12.0:
146+ image : public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# Command is given as a two-element list: `bash` plus the benchmark script path.
147+ command :
148+ - bash
149+ - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
150+ mount-buildkite-agent : true
151+ propagate-environment : true
152+ ipc : host
# GPUs are listed explicitly as 4,5,6,7, whereas the H100 docker step uses `gpus : all`.
# NOTE(review): presumably the other devices on the H200 host are reserved elsewhere; confirm.
153+ gpus : 4,5,6,7
# Maps /data/benchmark-hf-cache to /root/.cache/huggingface.
# NOTE(review): host-path:container-path order assumed from the docker plugin convention.
154+ volumes :
155+ - /data/benchmark-hf-cache:/root/.cache/huggingface
# Variables are listed by name only, with no values.
# NOTE(review): presumably forwarded from the agent's environment; confirm.
156+ environment :
157+ - VLLM_USAGE_SOURCE
158+ - HF_TOKEN
159+
160+ # - block: "Run H100 Benchmark"
161+ # key: block-h100
162+ # depends_on: ~
78163
79164 - label : " H100"
80165 # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
81166 agents :
82167 queue : H100
83168 depends_on : wait-for-container-image
169+ if : build.branch != "main"
84170 plugins :
85171 - docker#v5.12.0:
86- image : public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
172+ image : public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
87173 command :
88174 - bash
89175 - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
0 commit comments