-
Notifications
You must be signed in to change notification settings - Fork 679
feat: Update test_vllm_health_check_active to make it stable #4200
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -162,7 +162,6 @@ def send_completion_request( | |
| @pytest.mark.gpu_1 | ||
| @pytest.mark.e2e | ||
| @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME) | ||
| @pytest.mark.skip(reason="Flaky, temporarily disabled") | ||
| def test_vllm_health_check_active(request, runtime_services): | ||
| """ | ||
| End-to-end test for worker fault tolerance with migration support. | ||
|
|
@@ -192,25 +191,53 @@ def test_vllm_health_check_active(request, runtime_services): | |
| # Step 4: Find and kill vLLM engine processes to force the EngineDeadError condition. | ||
| children = worker.subprocesses() | ||
| logger.info(f"Worker children: {[child.pid for child in children]}") | ||
| vllm_child = None | ||
| for child in children: | ||
| cmdline = child.cmdline() | ||
| if len(cmdline) > 0 and cmdline[0] == "VLLM::EngineCore": | ||
| logger.warning( | ||
| f"Killing vLLM engine process {{ pid: {child.pid}, cmdline: '{' '.join(cmdline)}' }}" | ||
| ) | ||
| vllm_child = child | ||
| child.kill() | ||
| break | ||
|
|
||
| time.sleep(2) # Give some time for the worker to stabilize | ||
|
|
||
| # Step 5: Send a request triggering the handler to shutdown everything. | ||
| test_response = send_completion_request("How old are you?", 100, timeout=60) | ||
| logger.error(f"Test request failed: {test_response}") | ||
|
|
||
| # Step 6: Ensure the worker process has been stopped as a result of the EngineDeadError condition. | ||
| if worker.is_running(): | ||
| # Wait for the vLLM engine process to actually die | ||
| if vllm_child: | ||
| try: | ||
| vllm_child.wait(timeout=30) | ||
| logger.info(f"vLLM engine process {vllm_child.pid} has been terminated") | ||
| except Exception as e: | ||
| logger.error(f"Failed to wait for vLLM engine process to die: {e}") | ||
| pytest.fail(f"vLLM engine process did not terminate: {e}") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Improve exception handling per static analysis recommendations. The exception handling has two issues:
Apply this diff to address both issues: # Wait for the vLLM engine process to actually die
if vllm_child:
try:
vllm_child.wait(timeout=30)
logger.info(f"vLLM engine process {vllm_child.pid} has been terminated")
- except Exception as e:
- logger.error(f"Failed to wait for vLLM engine process to die: {e}")
+ except (psutil.TimeoutExpired, psutil.NoSuchProcess) as e:
+ logger.exception(f"Failed to wait for vLLM engine process to die: {e}")
pytest.fail(f"vLLM engine process did not terminate: {e}")Note: You'll need to ensure
🧰 Tools🪛 Ruff (0.14.3)210-210: Do not catch blind exception: (BLE001) 211-211: Use Replace with (TRY400) 🤖 Prompt for AI Agents |
||
|
|
||
| # Step 5: Send a request that should fail due to the dead engine. | ||
| try: | ||
| test_response = send_completion_request("How old are you?", 100, timeout=60) | ||
| # Verify the response indicates an error (non-2xx status code) | ||
| if test_response.status_code < 400: | ||
| pytest.fail( | ||
| f"Expected error response after killing vLLM engine, but got status {test_response.status_code}" | ||
| ) | ||
| logger.info(f"Request correctly failed with status {test_response.status_code}: {test_response.text}") | ||
| except requests.exceptions.RequestException as e: | ||
| # It's also acceptable for the request to raise an exception | ||
| logger.info(f"Request correctly failed with exception: {e}") | ||
|
|
||
| # Step 6: Wait for the worker process to stop as a result of the EngineDeadError condition. | ||
| max_wait = 60 # seconds | ||
| poll_interval = 0.5 # seconds | ||
| elapsed = 0 | ||
| while elapsed < max_wait: | ||
| if not worker.is_running(): | ||
| logger.info(f"Worker stopped after {elapsed:.1f} seconds") | ||
| break | ||
| time.sleep(poll_interval) | ||
| elapsed += poll_interval | ||
| else: | ||
| # Loop completed without break - worker still running | ||
| pytest.fail( | ||
| "Worker should not be running after killing vLLM engine process." | ||
| f"Worker should not be running after killing vLLM engine process (waited {max_wait}s)" | ||
| ) | ||
|
|
||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Verify that the vLLM engine process was found before proceeding.
If no child process with
cmdline[0] == "VLLM::EngineCore"is found,vllm_childremainsNoneand the test continues without actually killing the engine. This would make the test invalid since it wouldn't be testing the EngineDeadError condition.Add a check after the loop to ensure the engine process was found:
for child in children: cmdline = child.cmdline() if len(cmdline) > 0 and cmdline[0] == "VLLM::EngineCore": logger.warning( f"Killing vLLM engine process {{ pid: {child.pid}, cmdline: '{' '.join(cmdline)}' }}" ) vllm_child = child child.kill() break + + if vllm_child is None: + pytest.fail("Could not find VLLM::EngineCore process to kill")🤖 Prompt for AI Agents