|
2 | 2 | # SPDX-FileCopyrightText: Copyright contributors to the vLLM project |
3 | 3 |
|
4 | 4 | import asyncio |
| 5 | +import contextlib |
5 | 6 | import copy |
6 | 7 | import functools |
7 | 8 | import importlib |
@@ -799,43 +800,99 @@ def wait_for_gpu_memory_to_clear(*, |
799 | 800 |
|
800 | 801 |
|
801 | 802 | def fork_new_process_for_each_test( |
802 | | - f: Callable[_P, None]) -> Callable[_P, None]: |
| 803 | + func: Callable[_P, None]) -> Callable[_P, None]: |
803 | 804 | """Decorator to fork a new process for each test function. |
804 | 805 | See https://github.com/vllm-project/vllm/issues/7053 for more details. |
805 | 806 | """ |
806 | 807 |
|
807 | | - @functools.wraps(f) |
| 808 | + @functools.wraps(func) |
808 | 809 | def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: |
809 | 810 | # Make the process the leader of its own process group |
810 | 811 | # to avoid sending SIGTERM to the parent process |
811 | 812 | os.setpgrp() |
812 | 813 | from _pytest.outcomes import Skipped |
813 | | - pid = os.fork() |
814 | | - print(f"Fork a new process to run a test {pid}") |
815 | | - if pid == 0: |
816 | | - try: |
817 | | - f(*args, **kwargs) |
818 | | - except Skipped as e: |
819 | | - # convert Skipped to exit code 0 |
820 | | - print(str(e)) |
821 | | - os._exit(0) |
822 | | - except Exception: |
823 | | - import traceback |
824 | | - traceback.print_exc() |
825 | | - os._exit(1) |
| 814 | + |
| 815 | + # Create a unique temporary file to store exception info from child |
| 816 | + # process. Use test function name and process ID to avoid collisions. |
| 817 | + with tempfile.NamedTemporaryFile( |
| 818 | + mode='w+b', |
| 819 | + prefix=f"vllm_test_{func.__name__}_{os.getpid()}_", |
| 820 | + suffix=".exc") as exc_file: |
| 821 | + exc_file_path = exc_file.name |
| 822 | + |
| 823 | + pid = os.fork() |
| 824 | + print(f"Fork a new process to run a test {pid}") |
| 825 | + if pid == 0: |
| 826 | + try: |
| 827 | + func(*args, **kwargs) |
| 828 | + except Skipped as e: |
| 829 | + # convert Skipped to exit code 0 |
| 830 | + print(str(e)) |
| 831 | + os._exit(0) |
| 832 | + except Exception as e: |
| 833 | + import traceback |
| 834 | + tb_string = traceback.format_exc() |
| 835 | + |
| 836 | + # Try to serialize the exception object first |
| 837 | + try: |
| 838 | + # First, try to pickle the actual exception with |
| 839 | + # its traceback. |
| 840 | + exc_to_serialize = {'pickled_exception': e} |
| 841 | + # Test if it can be pickled |
| 842 | + cloudpickle.dumps(exc_to_serialize) |
| 843 | + except Exception: |
| 844 | + # Fall back to string-based approach |
| 845 | + exc_to_serialize = { |
| 846 | + 'exception_type': type(e).__name__, |
| 847 | + 'exception_msg': str(e), |
| 848 | + 'traceback': tb_string, |
| 849 | + } |
| 850 | + try: |
| 851 | + with open(exc_file_path, 'wb') as f: |
| 852 | + cloudpickle.dump(exc_to_serialize, f) |
| 853 | + except Exception: |
| 854 | + # Fallback: just print the traceback. |
| 855 | + traceback.print_exc() |
| 856 | + os._exit(1) |
| 857 | + else: |
| 858 | + os._exit(0) |
826 | 859 | else: |
827 | | - os._exit(0) |
828 | | - else: |
829 | | - pgid = os.getpgid(pid) |
830 | | - _pid, _exitcode = os.waitpid(pid, 0) |
831 | | - # ignore SIGTERM signal itself |
832 | | - old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN) |
833 | | - # kill all child processes |
834 | | - os.killpg(pgid, signal.SIGTERM) |
835 | | - # restore the signal handler |
836 | | - signal.signal(signal.SIGTERM, old_signal_handler) |
837 | | - assert _exitcode == 0, (f"function {f} failed when called with" |
838 | | - f" args {args} and kwargs {kwargs}") |
| 860 | + pgid = os.getpgid(pid) |
| 861 | + _pid, _exitcode = os.waitpid(pid, 0) |
| 862 | + # ignore SIGTERM signal itself |
| 863 | + old_signal_handler = signal.signal(signal.SIGTERM, |
| 864 | + signal.SIG_IGN) |
| 865 | + # kill all child processes |
| 866 | + os.killpg(pgid, signal.SIGTERM) |
| 867 | + # restore the signal handler |
| 868 | + signal.signal(signal.SIGTERM, old_signal_handler) |
| 869 | + if _exitcode != 0: |
| 870 | + # Try to read the exception from the child process |
| 871 | + exc_info = {} |
| 872 | + if os.path.exists(exc_file_path): |
| 873 | + with contextlib.suppress(Exception), \ |
| 874 | + open(exc_file_path, 'rb') as f: |
| 875 | + exc_info = cloudpickle.load(f) |
| 876 | + |
| 877 | + if (original_exception := exc_info.get( |
| 878 | + 'pickled_exception')) is not None: |
| 879 | + # Re-raise the actual exception object if it was |
| 880 | + # successfully pickled. |
| 881 | + assert isinstance(original_exception, Exception) |
| 882 | + raise original_exception |
| 883 | + |
| 884 | + if (original_tb := exc_info.get("traceback")) is not None: |
| 885 | + # Use string-based traceback for fallback case |
| 886 | + raise AssertionError( |
| 887 | + f"Test {func.__name__} failed when called with" |
| 888 | + f" args {args} and kwargs {kwargs}" |
| 889 | + f" (exit code: {_exitcode}):\n{original_tb}") |
| 890 | + else: |
| 891 | + # Fallback to the original generic error |
| 892 | + raise AssertionError( |
| 893 | + f"function {func.__name__} failed when called with" |
| 894 | + f" args {args} and kwargs {kwargs}" |
| 895 | + f" (exit code: {_exitcode})") |
839 | 896 |
|
840 | 897 | return wrapper |
841 | 898 |
|
|
0 commit comments