Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
00f167a
[CI] Fail subprocess tests with root-cause error
njhill Aug 28, 2025
5557740
only delete temp file in parent process
njhill Aug 28, 2025
7923e66
Merge remote-tracking branch 'origin/main' into root-cause-failures
njhill Aug 28, 2025
545294f
fix pre-commit
njhill Aug 28, 2025
cb95464
Merge remote-tracking branch 'origin/main' into root-cause-failures
njhill Aug 28, 2025
26133f0
minor
njhill Aug 29, 2025
1127de0
Merge remote-tracking branch 'origin/main' into root-cause-failures
njhill Aug 29, 2025
e5476c9
Merge remote-tracking branch 'refs/remotes/origin/main' into root-cau…
njhill Aug 29, 2025
c8e1e32
fix ray distributed executor destructor error
njhill Aug 29, 2025
1d0e395
Merge remote-tracking branch 'origin/main' into root-cause-failures
njhill Aug 31, 2025
57be627
Merge remote-tracking branch 'origin/main' into root-cause-failures
njhill Sep 2, 2025
97833c2
Merge remote-tracking branch 'origin/main' into root-cause-failures
njhill Sep 3, 2025
2a37c8d
Merge remote-tracking branch 'origin/main' into root-cause-failures
njhill Sep 3, 2025
4f5cde4
Merge remote-tracking branch 'refs/remotes/origin/main' into root-cau…
njhill Sep 5, 2025
970465f
add timeout to hanging test
njhill Sep 5, 2025
a5b79e2
add env var for nccl debug
njhill Sep 5, 2025
c12fce9
Merge remote-tracking branch 'origin/main' into root-cause-failures
njhill Sep 9, 2025
ef248cf
try some things
njhill Sep 9, 2025
d14cbac
revert debug changes
njhill Sep 9, 2025
605d205
Merge remote-tracking branch 'refs/remotes/origin/main' into root-cau…
njhill Sep 9, 2025
6255133
Merge remote-tracking branch 'origin/main' into root-cause-failures
njhill Sep 10, 2025
587b0a2
Merge remote-tracking branch 'origin/main' into root-cause-failures
njhill Sep 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements/test.in
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli
sentence-transformers # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests
tblib # for pickling test exceptions
timm >=1.0.17 # required for internvl and gemma3n-mm test
torch==2.7.1
torchaudio==2.7.1
Expand Down
4 changes: 3 additions & 1 deletion requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ contourpy==1.3.0
# via matplotlib
cramjam==2.9.0
# via fastparquet
cupy-cuda12x==13.3.0
cupy-cuda12x==13.6.0
# via ray
cycler==0.12.1
# via matplotlib
Expand Down Expand Up @@ -1032,6 +1032,8 @@ tabledata==1.3.3
# via pytablewriter
tabulate==0.9.0
# via sacrebleu
tblib==3.1.0
# via -r requirements/test.in
tcolorpy==0.1.6
# via pytablewriter
tenacity==9.0.0
Expand Down
10 changes: 10 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# ruff: noqa

from tblib import pickling_support

# Install support for pickling exceptions so that we can nicely propagate
# failures from tests running in a subprocess.
# This should be run before any custom exception subclasses are defined.
pickling_support.install()

import json
import math
import os
Expand Down
120 changes: 92 additions & 28 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import asyncio
import contextlib
import copy
import functools
import importlib
Expand All @@ -13,7 +14,7 @@
import tempfile
import time
import warnings
from contextlib import contextmanager, suppress
from contextlib import ExitStack, contextmanager, suppress
from multiprocessing import Process
from pathlib import Path
from typing import Any, Callable, Literal, Optional, Union
Expand Down Expand Up @@ -799,43 +800,106 @@ def wait_for_gpu_memory_to_clear(*,


def fork_new_process_for_each_test(
f: Callable[_P, None]) -> Callable[_P, None]:
func: Callable[_P, None]) -> Callable[_P, None]:
"""Decorator to fork a new process for each test function.
See https://github.com/vllm-project/vllm/issues/7053 for more details.
"""

@functools.wraps(f)
@functools.wraps(func)
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
# Make the process the leader of its own process group
# to avoid sending SIGTERM to the parent process
os.setpgrp()
from _pytest.outcomes import Skipped
pid = os.fork()
print(f"Fork a new process to run a test {pid}")
if pid == 0:
try:
f(*args, **kwargs)
except Skipped as e:
# convert Skipped to exit code 0
print(str(e))
os._exit(0)
except Exception:
import traceback
traceback.print_exc()
os._exit(1)

# Create a unique temporary file to store exception info from child
# process. Use test function name and process ID to avoid collisions.
with tempfile.NamedTemporaryFile(
delete=False,
mode='w+b',
prefix=f"vllm_test_{func.__name__}_{os.getpid()}_",
suffix=".exc") as exc_file, ExitStack() as delete_after:
exc_file_path = exc_file.name
delete_after.callback(os.remove, exc_file_path)

pid = os.fork()
print(f"Fork a new process to run a test {pid}")
if pid == 0:
# The parent process is responsible for deleting the temp file;
# don't delete it in the child.
delete_after.pop_all()
try:
func(*args, **kwargs)
except Skipped as e:
# convert Skipped to exit code 0
print(str(e))
os._exit(0)
except Exception as e:
import traceback
tb_string = traceback.format_exc()

# Try to serialize the exception object first
exc_to_serialize: dict[str, Any]
try:
# First, try to pickle the actual exception with
# its traceback.
exc_to_serialize = {'pickled_exception': e}
# Test if it can be pickled
cloudpickle.dumps(exc_to_serialize)
except (Exception, KeyboardInterrupt):
# Fall back to string-based approach.
exc_to_serialize = {
'exception_type': type(e).__name__,
'exception_msg': str(e),
'traceback': tb_string,
}
try:
with open(exc_file_path, 'wb') as f:
cloudpickle.dump(exc_to_serialize, f)
except Exception:
# Fallback: just print the traceback.
print(tb_string)
os._exit(1)
else:
os._exit(0)
else:
os._exit(0)
else:
pgid = os.getpgid(pid)
_pid, _exitcode = os.waitpid(pid, 0)
# ignore SIGTERM signal itself
old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
# kill all child processes
os.killpg(pgid, signal.SIGTERM)
# restore the signal handler
signal.signal(signal.SIGTERM, old_signal_handler)
assert _exitcode == 0, (f"function {f} failed when called with"
f" args {args} and kwargs {kwargs}")
pgid = os.getpgid(pid)
_pid, _exitcode = os.waitpid(pid, 0)
# ignore SIGTERM signal itself
old_signal_handler = signal.signal(signal.SIGTERM,
signal.SIG_IGN)
# kill all child processes
os.killpg(pgid, signal.SIGTERM)
# restore the signal handler
signal.signal(signal.SIGTERM, old_signal_handler)
if _exitcode != 0:
# Try to read the exception from the child process
exc_info = {}
if os.path.exists(exc_file_path):
with contextlib.suppress(Exception), \
open(exc_file_path, 'rb') as f:
exc_info = cloudpickle.load(f)

if (original_exception :=
exc_info.get('pickled_exception')) is not None:
# Re-raise the actual exception object if it was
# successfully pickled.
assert isinstance(original_exception, Exception)
raise original_exception

if (original_tb := exc_info.get("traceback")) is not None:
# Use string-based traceback for fallback case
raise AssertionError(
f"Test {func.__name__} failed when called with"
f" args {args} and kwargs {kwargs}"
f" (exit code: {_exitcode}):\n{original_tb}"
) from None

# Fall back to the original generic error
raise AssertionError(
f"function {func.__name__} failed when called with"
f" args {args} and kwargs {kwargs}"
f" (exit code: {_exitcode})") from None

return wrapper

Expand Down