From 00f167a4069462f59f4f01a45f859a70ebf442c8 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 27 Aug 2025 21:30:09 -0700 Subject: [PATCH 1/9] [CI] Fail subprocess tests with root-cause error Signed-off-by: Nick Hill --- requirements/test.in | 1 + requirements/test.txt | 4 +- tests/conftest.py | 10 ++++ tests/utils.py | 111 ++++++++++++++++++++++++++++++++---------- 4 files changed, 98 insertions(+), 28 deletions(-) diff --git a/requirements/test.in b/requirements/test.in index 92c577c50163..5e4280adb752 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -21,6 +21,7 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests +tblib # for pickling test exceptions timm >=1.0.17 # required for internvl and gemma3n-mm test torch==2.7.1 torchaudio==2.7.1 diff --git a/requirements/test.txt b/requirements/test.txt index 0c27c9bb67e8..cbb7c60256e6 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -137,7 +137,7 @@ contourpy==1.3.0 # via matplotlib cramjam==2.9.0 # via fastparquet -cupy-cuda12x==13.3.0 +cupy-cuda12x==13.6.0 # via ray cycler==0.12.1 # via matplotlib @@ -1032,6 +1032,8 @@ tabledata==1.3.3 # via pytablewriter tabulate==0.9.0 # via sacrebleu +tblib==3.1.0 + # via -r requirements/test.in tcolorpy==0.1.6 # via pytablewriter tenacity==9.0.0 diff --git a/tests/conftest.py b/tests/conftest.py index f8bfdfc8e625..c4558d8fcbc0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# ruff: noqa + +from tblib import pickling_support + +# Install support for pickling exceptions so that we can nicely propagate +# failures from tests running in a subprocess. +# This should be run before any custom exception subclasses are defined. +pickling_support.install() + import json import os import tempfile diff --git a/tests/utils.py b/tests/utils.py index 9d2073f3c103..27d9ea733b08 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import contextlib import copy import functools import importlib @@ -799,43 +800,99 @@ def wait_for_gpu_memory_to_clear(*, def fork_new_process_for_each_test( - f: Callable[_P, None]) -> Callable[_P, None]: + func: Callable[_P, None]) -> Callable[_P, None]: """Decorator to fork a new process for each test function. See https://github.com/vllm-project/vllm/issues/7053 for more details. """ - @functools.wraps(f) + @functools.wraps(func) def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: # Make the process the leader of its own process group # to avoid sending SIGTERM to the parent process os.setpgrp() from _pytest.outcomes import Skipped - pid = os.fork() - print(f"Fork a new process to run a test {pid}") - if pid == 0: - try: - f(*args, **kwargs) - except Skipped as e: - # convert Skipped to exit code 0 - print(str(e)) - os._exit(0) - except Exception: - import traceback - traceback.print_exc() - os._exit(1) + + # Create a unique temporary file to store exception info from child + # process. Use test function name and process ID to avoid collisions. 
+ with tempfile.NamedTemporaryFile( + mode='w+b', + prefix=f"vllm_test_{func.__name__}_{os.getpid()}_", + suffix=".exc") as exc_file: + exc_file_path = exc_file.name + + pid = os.fork() + print(f"Fork a new process to run a test {pid}") + if pid == 0: + try: + func(*args, **kwargs) + except Skipped as e: + # convert Skipped to exit code 0 + print(str(e)) + os._exit(0) + except Exception as e: + import traceback + tb_string = traceback.format_exc() + + # Try to serialize the exception object first + try: + # First, try to pickle the actual exception with + # its traceback. + exc_to_serialize = {'pickled_exception': e} + # Test if it can be pickled + cloudpickle.dumps(exc_to_serialize) + except Exception: + # Fall back to string-based approach + exc_to_serialize = { + 'exception_type': type(e).__name__, + 'exception_msg': str(e), + 'traceback': tb_string, + } + try: + with open(exc_file_path, 'wb') as f: + cloudpickle.dump(exc_to_serialize, f) + except Exception: + # Fallback: just print the traceback. + traceback.print_exc() + os._exit(1) + else: + os._exit(0) else: - os._exit(0) - else: - pgid = os.getpgid(pid) - _pid, _exitcode = os.waitpid(pid, 0) - # ignore SIGTERM signal itself - old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN) - # kill all child processes - os.killpg(pgid, signal.SIGTERM) - # restore the signal handler - signal.signal(signal.SIGTERM, old_signal_handler) - assert _exitcode == 0, (f"function {f} failed when called with" - f" args {args} and kwargs {kwargs}") + pgid = os.getpgid(pid) + _pid, _exitcode = os.waitpid(pid, 0) + # ignore SIGTERM signal itself + old_signal_handler = signal.signal(signal.SIGTERM, + signal.SIG_IGN) + # kill all child processes + os.killpg(pgid, signal.SIGTERM) + # restore the signal handler + signal.signal(signal.SIGTERM, old_signal_handler) + if _exitcode != 0: + # Try to read the exception from the child process + exc_info = {} + if os.path.exists(exc_file_path): + with contextlib.suppress(Exception), \ + open(exc_file_path, 'rb') as f: + exc_info = cloudpickle.load(f) + + if (original_exception := exc_info.get( + 'pickled_exception')) is not None: + # Re-raise the actual exception object if it was + # successfully pickled. 
+ assert isinstance(original_exception, Exception) + raise original_exception + + if (original_tb := exc_info.get("traceback")) is not None: + # Use string-based traceback for fallback case + raise AssertionError( + f"Test {func.__name__} failed when called with" + f" args {args} and kwargs {kwargs}" + f" (exit code: {_exitcode}):\n{original_tb}") + else: + # Fallback to the original generic error + raise AssertionError( + f"function {func.__name__} failed when called with" + f" args {args} and kwargs {kwargs}" + f" (exit code: {_exitcode})") return wrapper From 5557740a2f810d7250685431835f3f96f0766662 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 28 Aug 2025 07:43:31 -0700 Subject: [PATCH 2/9] only delete temp file in parent process Signed-off-by: Nick Hill --- tests/utils.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 27d9ea733b08..e12b6d73fd09 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -14,7 +14,7 @@ import tempfile import time import warnings -from contextlib import contextmanager, suppress +from contextlib import ExitStack, contextmanager, suppress from multiprocessing import Process from pathlib import Path from typing import Any, Callable, Literal, Optional, Union @@ -815,14 +815,19 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: # Create a unique temporary file to store exception info from child # process. Use test function name and process ID to avoid collisions. with tempfile.NamedTemporaryFile( + delete=False, mode='w+b', prefix=f"vllm_test_{func.__name__}_{os.getpid()}_", - suffix=".exc") as exc_file: + suffix=".exc") as exc_file, ExitStack() as delete_after: exc_file_path = exc_file.name + delete_after.callback(os.remove, exc_file_path) pid = os.fork() print(f"Fork a new process to run a test {pid}") if pid == 0: + # Parent process responsible for deleting, don't delete + # in child. + delete_after.pop_all() try: func(*args, **kwargs) except Skipped as e: @@ -841,7 +846,7 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: # Test if it can be pickled cloudpickle.dumps(exc_to_serialize) except Exception: - # Fall back to string-based approach + # Fall back to string-based approach. exc_to_serialize = { 'exception_type': type(e).__name__, 'exception_msg': str(e), @@ -874,8 +879,8 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: open(exc_file_path, 'rb') as f: exc_info = cloudpickle.load(f) - if (original_exception := exc_info.get( - 'pickled_exception')) is not None: + if (original_exception := + exc_info.get('pickled_exception')) is not None: # Re-raise the actual exception object if it was # successfully pickled. assert isinstance(original_exception, Exception) From 545294feb56a97edd6aee48f16e1993b2243332a Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 28 Aug 2025 08:11:15 -0700 Subject: [PATCH 3/9] fix pre-commit Signed-off-by: Nick Hill --- tests/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/utils.py b/tests/utils.py index e12b6d73fd09..529b99af015c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -839,6 +839,7 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: tb_string = traceback.format_exc() # Try to serialize the exception object first + exc_to_serialize: dict[str, Any] try: # First, try to pickle the actual exception with # its traceback. 
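
For orientation, the three patches above amount to the following pattern: tblib's pickling_support.install() (added to conftest.py) makes exceptions picklable together with their tracebacks, the forked child serializes any failure to a temporary file with cloudpickle, and the parent re-raises it so pytest reports the root cause. A minimal standalone sketch of that pattern, with illustrative helper names (run_forked, failing_test) that are not part of the patch:

import os
import tempfile

import cloudpickle
from tblib import pickling_support

# Make exceptions picklable together with their tracebacks.
pickling_support.install()


def run_forked(fn):
    """Run fn() in a forked child and re-raise any child failure in the parent."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".exc") as tmp:
        exc_path = tmp.name
    try:
        pid = os.fork()
        if pid == 0:
            # Child: never deletes the temp file; the parent owns cleanup.
            try:
                fn()
            except Exception as exc:
                with open(exc_path, "wb") as f:
                    cloudpickle.dump(exc, f)
                os._exit(1)
            os._exit(0)
        # Parent: wait for the child and surface its failure, if any.
        _, status = os.waitpid(pid, 0)
        if os.waitstatus_to_exitcode(status) != 0:
            if os.path.getsize(exc_path) > 0:
                with open(exc_path, "rb") as f:
                    raise cloudpickle.load(f)  # child's traceback is preserved
            raise AssertionError(f"{fn.__name__} failed in the child process")
    finally:
        os.remove(exc_path)


def failing_test():
    raise ValueError("boom from the child process")


if __name__ == "__main__":
    run_forked(failing_test)  # raises ValueError with the child's traceback

The sketch re-raises the unpickled exception directly; the real decorator additionally handles pytest skips, a string-based traceback fallback when the exception itself cannot be pickled, and process-group cleanup.
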
From 26133f0b10b7e57ec6d427dd0ab7fffadadc4565 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 28 Aug 2025 18:09:13 -0700 Subject: [PATCH 4/9] minor Signed-off-by: Nick Hill --- tests/utils.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 529b99af015c..3d931e745749 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -846,7 +846,7 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: exc_to_serialize = {'pickled_exception': e} # Test if it can be pickled cloudpickle.dumps(exc_to_serialize) - except Exception: + except (Exception, KeyboardInterrupt): # Fall back to string-based approach. exc_to_serialize = { 'exception_type': type(e).__name__, @@ -858,7 +858,7 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: cloudpickle.dump(exc_to_serialize, f) except Exception: # Fallback: just print the traceback. - traceback.print_exc() + print(tb_string) os._exit(1) else: os._exit(0) @@ -892,13 +892,14 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: raise AssertionError( f"Test {func.__name__} failed when called with" f" args {args} and kwargs {kwargs}" - f" (exit code: {_exitcode}):\n{original_tb}") - else: - # Fallback to the original generic error - raise AssertionError( - f"function {func.__name__} failed when called with" - f" args {args} and kwargs {kwargs}" - f" (exit code: {_exitcode})") + f" (exit code: {_exitcode}):\n{original_tb}" + ) from None + + # Fallback to the original generic error + raise AssertionError( + f"function {func.__name__} failed when called with" + f" args {args} and kwargs {kwargs}" + f" (exit code: {_exitcode})") from None return wrapper From c8e1e327dc466833a400666298b9fe329ea6bdb9 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 29 Aug 2025 15:23:29 -0700 Subject: [PATCH 5/9] fix ray distributed executor destructor error Signed-off-by: Nick Hill --- vllm/executor/ray_distributed_executor.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 37c3fe59c65d..78d0ee6c1e3f 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -117,10 +117,12 @@ def _init_executor(self) -> None: self.driver_worker.execute_method) def shutdown(self) -> None: - logger.info( - "Shutting down Ray distributed executor. If you see error log " - "from logging.cc regarding SIGTERM received, please ignore because " - "this is the expected termination process in Ray.") + if logger: + # Somehow logger can be None here. + logger.info( + "Shutting down Ray distributed executor. 
If you see error log " + "from logging.cc regarding SIGTERM received, please ignore " + "because this is the expected termination process in Ray.") if hasattr(self, "forward_dag") and self.forward_dag is not None: self.forward_dag.teardown() import ray From 970465f96674afabbdf398d2b26d30c4eec6e5bc Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 5 Sep 2025 14:48:21 -0700 Subject: [PATCH 6/9] add timeout to hanging test Signed-off-by: Nick Hill --- tests/async_engine/test_api_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 90f63e7ea17d..41058f8a48dc 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -52,6 +52,7 @@ def api_server(distributed_executor_backend: str): uvicorn_process.terminate() +@pytest.mark.timeout(300) @pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) def test_api_server(api_server, distributed_executor_backend: str): """ From a5b79e273c65816c7c173048ed01c5b9b581135c Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 5 Sep 2025 14:48:33 -0700 Subject: [PATCH 7/9] add env var for nccl debug Signed-off-by: Nick Hill --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 55349e0ac932..48f12d5924a3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -156,7 +156,7 @@ steps: - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py # test with internal dp - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py + - NCCL_DEBUG=INFO TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py From ef248cfdaea226c4f248e6738c2c203c57be93f9 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Mon, 8 Sep 2025 21:08:42 -0700 Subject: [PATCH 8/9] try some things Signed-off-by: Nick Hill --- .buildkite/test-pipeline.yaml | 1 + tests/async_engine/test_api_server.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 3b9a92487484..72a80c010b30 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -162,6 +162,7 @@ steps: # test with tp=2 and pp=2 - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py # test with internal dp + - export NCCL_CUMEM_ENABLE=0 - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - NCCL_DEBUG=INFO TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 41058f8a48dc..07370a880329 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copyreg import os import subprocess import sys @@ -10,6 +11,30 @@ import pytest import requests +import urllib3.exceptions + + +def _pickle_new_connection_error(obj): + """Custom pickler for NewConnectionError to fix tblib compatibility.""" + # Extract the original message by removing 
the "conn: " prefix + full_message = obj.args[0] if obj.args else "" + if ': ' in full_message: + # Split off the connection part and keep the actual message + _, actual_message = full_message.split(': ', 1) + else: + actual_message = full_message + return _unpickle_new_connection_error, (actual_message, ) + + +def _unpickle_new_connection_error(message): + """Custom unpickler for NewConnectionError.""" + # Create with None as conn and the actual message + return urllib3.exceptions.NewConnectionError(None, message) + + +# Register the custom pickle/unpickle functions for tblib compatibility +copyreg.pickle(urllib3.exceptions.NewConnectionError, + _pickle_new_connection_error) def _query_server(prompt: str, max_tokens: int = 5) -> dict: From d14cbacca114ec51b3fe6fefa47a6ef316028124 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 9 Sep 2025 09:14:03 -0700 Subject: [PATCH 9/9] revert debug changes Signed-off-by: Nick Hill --- .buildkite/test-pipeline.yaml | 3 +-- tests/conftest.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 72a80c010b30..b0d4c4456d33 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -162,9 +162,8 @@ steps: # test with tp=2 and pp=2 - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py # test with internal dp - - export NCCL_CUMEM_ENABLE=0 - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - - NCCL_DEBUG=INFO TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py diff --git a/tests/conftest.py b/tests/conftest.py index 3a13e8cfbade..0440e859fe02 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1365,7 +1365,7 @@ def get_image_asset(self, name: str) -> Image.Image: @pytest.fixture(scope="session") def local_asset_server() -> Generator[LocalAssetServer, None, None]: """ - Starts a thread based HTTP server bound to 127.0.0.1 on a random free port. + Starts a thread based HTTP server bound to 127.0.0.1 on a random free port. The server currently servers images at: http://127.0.0.1:/. """
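
The copyreg registration added to tests/async_engine/test_api_server.py in PATCH 8/9 follows a general pattern: an exception whose __init__ takes extra positional arguments (urllib3's NewConnectionError takes a connection object plus a message) cannot be rebuilt by pickle's default Cls(*args) path once tblib starts pickling test failures, so a custom reduce/reconstruct pair is registered for it. A sketch of that pattern, using a made-up DemoError as a stand-in for such an exception:

import copyreg
import pickle


class DemoError(Exception):
    """Stand-in for an exception whose constructor takes an extra argument."""

    def __init__(self, conn, message):
        # Mimics urllib3's NewConnectionError: args[0] becomes "conn: message".
        super().__init__(f"{conn}: {message}")
        self.conn = conn


def _unpickle_demo_error(message):
    # Rebuild with a placeholder connection; only the message survives.
    return DemoError(None, message)


def _pickle_demo_error(exc):
    # Strip the "conn: " prefix so the reconstructed message stays readable.
    full = exc.args[0] if exc.args else ""
    _, _, message = full.partition(": ")
    return _unpickle_demo_error, (message or full, )


copyreg.pickle(DemoError, _pickle_demo_error)

# Without the reducer, pickle.loads would raise a TypeError because
# DemoError.__init__ requires two positional arguments. With it registered,
# round-tripping works:
err = DemoError("conn-object", "connection refused")
restored = pickle.loads(pickle.dumps(err))
assert isinstance(restored, DemoError)
assert "connection refused" in str(restored)

copyreg.pickle(cls, reducer) registers the reducer process-wide, so doing it once at import time, as the test module does, is sufficient.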