@@ -109,10 +109,9 @@ steps:
109109 - tests/entrypoints/offline_mode
110110 commands :
111111 - export VLLM_WORKER_MULTIPROC_METHOD=spawn
112- - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
112+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
113113 - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
114114 - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
115- - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
116115 - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
117116
118117- label : Entrypoints Test (API Server) # 40min
@@ -234,16 +233,33 @@ steps:
234233 # OOM in the CI unless we run this separately
235234 - pytest -v -s tokenization
236235
237- - label : V1 Test
236+ - label : V1 Test e2e + engine
238237 mirror_hardwares : [amdexperimental]
239238 source_file_dependencies :
240239 - vllm/
241240 - tests/v1
242241 commands :
243- # split the test to avoid interference
244- - pytest -v -s v1/core
242+ # TODO: accuracy does not match, whether setting
243+ # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
244+ - pytest -v -s v1/e2e
245245 - pytest -v -s v1/engine
246+
247+ - label : V1 Test entrypoints
248+ mirror_hardwares : [amdexperimental]
249+ source_file_dependencies :
250+ - vllm/
251+ - tests/v1
252+ commands :
246253 - pytest -v -s v1/entrypoints
254+
255+ - label : V1 Test others
256+ mirror_hardwares : [amdexperimental]
257+ source_file_dependencies :
258+ - vllm/
259+ - tests/v1
260+ commands :
261+ # split the test to avoid interference
262+ - pytest -v -s v1/core
247263 - pytest -v -s v1/executor
248264 - pytest -v -s v1/sample
249265 - pytest -v -s v1/logits_processors
@@ -256,9 +272,6 @@ steps:
256272 - pytest -v -s v1/test_utils.py
257273 - pytest -v -s v1/test_oracle.py
258274 - pytest -v -s v1/test_metrics_reader.py
259- # TODO: accuracy does not match, whether setting
260- # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
261- - pytest -v -s v1/e2e
262275 # Integration test for streaming correctness (requires special branch).
263276 - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
264277 - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -312,7 +325,7 @@ steps:
312325 source_file_dependencies :
313326 - vllm/lora
314327 - tests/lora
315- command : pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
328+ command : pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py
316329 parallelism : 4
317330
318331- label : PyTorch Compilation Unit Tests
@@ -449,8 +462,8 @@ steps:
449462 - tests/quantization
450463 commands :
451464 # temporary install here since we need nightly, will move to requirements/test.in
452- # after torchao 0.12 release
453- - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
465+ # after torchao 0.12 release, and pin a working version of torchao nightly here
466+ - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
454467 - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
455468
456469- label : LM Eval Small Models # 53min
@@ -553,8 +566,7 @@ steps:
553566 - tests/models/multimodal
554567 commands :
555568 - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
556- - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
557- - pytest -v -s models/multimodal/processing/test_tensor_schema.py
569+ - pytest -v -s models/multimodal/processing
558570
559571- label : Multi-Modal Models Test (Standard)
560572 mirror_hardwares : [amdexperimental]
@@ -654,6 +666,7 @@ steps:
654666 # Quantization
655667 - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
656668 - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
669+ - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
657670 - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
658671 - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
659672 - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@@ -663,6 +676,7 @@ steps:
663676 - pytest -v -s tests/compile/test_fusion_all_reduce.py
664677 - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
665678 - pytest -v -s tests/kernels/moe/test_flashinfer.py
679+ - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
666680
667681# #### 1 GPU test #####
668682# #### multi gpus test #####
@@ -755,6 +769,11 @@ steps:
755769 - pytest -v -s plugins_tests/test_platform_plugins.py
756770 - pip uninstall vllm_add_dummy_platform -y
757771 # end platform plugin tests
772+ # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
773+ - pip install -e ./plugins/prithvi_io_processor_plugin
774+ - pytest -v -s plugins_tests/test_io_processor_plugins.py
775+ - pip uninstall prithvi_io_processor_plugin -y
776+ # end io_processor plugins test
758777 # other tests continue here:
759778 - pytest -v -s plugins_tests/test_scheduler_plugins.py
760779 - pip install -e ./plugins/vllm_add_dummy_model
@@ -791,13 +810,14 @@ steps:
791810 # requires multi-GPU testing for validation.
792811 - pytest -v -s -x lora/test_chatglm3_tp.py
793812 - pytest -v -s -x lora/test_llama_tp.py
794- - pytest -v -s -x lora/test_multi_loras_with_tp.py
813+ - pytest -v -s -x lora/test_llm_with_multi_loras.py
795814
796815
797816- label : Weight Loading Multiple GPU Test # 33min
798817 mirror_hardwares : [amdexperimental]
799818 working_dir : " /vllm-workspace/tests"
800- num_gpus : 2
819+ num_gpus : 2
820+ optional : true
801821 source_file_dependencies :
802822 - vllm/
803823 - tests/weight_loading
0 commit comments