Skip to content

Commit 00f167a

Browse files
committed
[CI] Fail subprocess tests with root-cause error
Signed-off-by: Nick Hill <[email protected]>
1 parent a11adaf commit 00f167a

File tree

4 files changed: 98 additions and 28 deletions

requirements/test.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli
2121
sentence-transformers # required for embedding tests
2222
soundfile # required for audio tests
2323
jiwer # required for audio tests
24+
tblib # for pickling test exceptions
2425
timm >=1.0.17 # required for internvl and gemma3n-mm test
2526
torch==2.7.1
2627
torchaudio==2.7.1

requirements/test.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ contourpy==1.3.0
137137
# via matplotlib
138138
cramjam==2.9.0
139139
# via fastparquet
140-
cupy-cuda12x==13.3.0
140+
cupy-cuda12x==13.6.0
141141
# via ray
142142
cycler==0.12.1
143143
# via matplotlib
@@ -1032,6 +1032,8 @@ tabledata==1.3.3
10321032
# via pytablewriter
10331033
tabulate==0.9.0
10341034
# via sacrebleu
1035+
tblib==3.1.0
1036+
# via -r requirements/test.in
10351037
tcolorpy==0.1.6
10361038
# via pytablewriter
10371039
tenacity==9.0.0

tests/conftest.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
4+
# ruff: noqa
5+
6+
from tblib import pickling_support
7+
8+
# Install support for pickling exceptions so that we can nicely propagate
9+
# failures from tests running in a subprocess.
10+
# This should be run before any custom exception subclasses are defined.
11+
pickling_support.install()
12+
313
import json
414
import os
515
import tempfile

tests/utils.py

Lines changed: 84 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
33

44
import asyncio
5+
import contextlib
56
import copy
67
import functools
78
import importlib
@@ -799,43 +800,99 @@ def wait_for_gpu_memory_to_clear(*,
799800

800801

801802
def fork_new_process_for_each_test(
802-
f: Callable[_P, None]) -> Callable[_P, None]:
803+
func: Callable[_P, None]) -> Callable[_P, None]:
803804
"""Decorator to fork a new process for each test function.
804805
See https://github.com/vllm-project/vllm/issues/7053 for more details.
805806
"""
806807

807-
@functools.wraps(f)
808+
@functools.wraps(func)
808809
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
809810
# Make the process the leader of its own process group
810811
# to avoid sending SIGTERM to the parent process
811812
os.setpgrp()
812813
from _pytest.outcomes import Skipped
813-
pid = os.fork()
814-
print(f"Fork a new process to run a test {pid}")
815-
if pid == 0:
816-
try:
817-
f(*args, **kwargs)
818-
except Skipped as e:
819-
# convert Skipped to exit code 0
820-
print(str(e))
821-
os._exit(0)
822-
except Exception:
823-
import traceback
824-
traceback.print_exc()
825-
os._exit(1)
814+
815+
# Create a unique temporary file to store exception info from child
816+
# process. Use test function name and process ID to avoid collisions.
817+
with tempfile.NamedTemporaryFile(
818+
mode='w+b',
819+
prefix=f"vllm_test_{func.__name__}_{os.getpid()}_",
820+
suffix=".exc") as exc_file:
821+
exc_file_path = exc_file.name
822+
823+
pid = os.fork()
824+
print(f"Fork a new process to run a test {pid}")
825+
if pid == 0:
826+
try:
827+
func(*args, **kwargs)
828+
except Skipped as e:
829+
# convert Skipped to exit code 0
830+
print(str(e))
831+
os._exit(0)
832+
except Exception as e:
833+
import traceback
834+
tb_string = traceback.format_exc()
835+
836+
# Try to serialize the exception object first
837+
try:
838+
# First, try to pickle the actual exception with
839+
# its traceback.
840+
exc_to_serialize = {'pickled_exception': e}
841+
# Test if it can be pickled
842+
cloudpickle.dumps(exc_to_serialize)
843+
except Exception:
844+
# Fall back to string-based approach
845+
exc_to_serialize = {
846+
'exception_type': type(e).__name__,
847+
'exception_msg': str(e),
848+
'traceback': tb_string,
849+
}
850+
try:
851+
with open(exc_file_path, 'wb') as f:
852+
cloudpickle.dump(exc_to_serialize, f)
853+
except Exception:
854+
# Fallback: just print the traceback.
855+
traceback.print_exc()
856+
os._exit(1)
857+
else:
858+
os._exit(0)
826859
else:
827-
os._exit(0)
828-
else:
829-
pgid = os.getpgid(pid)
830-
_pid, _exitcode = os.waitpid(pid, 0)
831-
# ignore SIGTERM signal itself
832-
old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
833-
# kill all child processes
834-
os.killpg(pgid, signal.SIGTERM)
835-
# restore the signal handler
836-
signal.signal(signal.SIGTERM, old_signal_handler)
837-
assert _exitcode == 0, (f"function {f} failed when called with"
838-
f" args {args} and kwargs {kwargs}")
860+
pgid = os.getpgid(pid)
861+
_pid, _exitcode = os.waitpid(pid, 0)
862+
# ignore SIGTERM signal itself
863+
old_signal_handler = signal.signal(signal.SIGTERM,
864+
signal.SIG_IGN)
865+
# kill all child processes
866+
os.killpg(pgid, signal.SIGTERM)
867+
# restore the signal handler
868+
signal.signal(signal.SIGTERM, old_signal_handler)
869+
if _exitcode != 0:
870+
# Try to read the exception from the child process
871+
exc_info = {}
872+
if os.path.exists(exc_file_path):
873+
with contextlib.suppress(Exception), \
874+
open(exc_file_path, 'rb') as f:
875+
exc_info = cloudpickle.load(f)
876+
877+
if (original_exception := exc_info.get(
878+
'pickled_exception')) is not None:
879+
# Re-raise the actual exception object if it was
880+
# successfully pickled.
881+
assert isinstance(original_exception, Exception)
882+
raise original_exception
883+
884+
if (original_tb := exc_info.get("traceback")) is not None:
885+
# Use string-based traceback for fallback case
886+
raise AssertionError(
887+
f"Test {func.__name__} failed when called with"
888+
f" args {args} and kwargs {kwargs}"
889+
f" (exit code: {_exitcode}):\n{original_tb}")
890+
else:
891+
# Fallback to the original generic error
892+
raise AssertionError(
893+
f"function {func.__name__} failed when called with"
894+
f" args {args} and kwargs {kwargs}"
895+
f" (exit code: {_exitcode})")
839896

840897
return wrapper
841898

0 commit comments

Comments
 (0)