Skip to content
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 37 additions & 10 deletions tests/fault_tolerance/test_vllm_health_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,6 @@ def send_completion_request(
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
@pytest.mark.skip(reason="Flaky, temporarily disabled")
def test_vllm_health_check_active(request, runtime_services):
"""
End-to-end test for worker fault tolerance with migration support.
Expand Down Expand Up @@ -192,25 +191,53 @@ def test_vllm_health_check_active(request, runtime_services):
# Step 4: Find and kill vLLM engine processes to force the EngineDeadError condition.
children = worker.subprocesses()
logger.info(f"Worker children: {[child.pid for child in children]}")
vllm_child = None
for child in children:
cmdline = child.cmdline()
if len(cmdline) > 0 and cmdline[0] == "VLLM::EngineCore":
logger.warning(
f"Killing vLLM engine process {{ pid: {child.pid}, cmdline: '{' '.join(cmdline)}' }}"
)
vllm_child = child
child.kill()
break
Comment on lines +194 to 203
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Verify that the vLLM engine process was found before proceeding.

If no child process with cmdline[0] == "VLLM::EngineCore" is found, vllm_child remains None and the test continues without actually killing the engine. This would make the test invalid since it wouldn't be testing the EngineDeadError condition.

Add a check after the loop to ensure the engine process was found:

             for child in children:
                 cmdline = child.cmdline()
                 if len(cmdline) > 0 and cmdline[0] == "VLLM::EngineCore":
                     logger.warning(
                         f"Killing vLLM engine process {{ pid: {child.pid}, cmdline: '{' '.join(cmdline)}' }}"
                     )
                     vllm_child = child
                     child.kill()
                     break
+
+            if vllm_child is None:
+                pytest.fail("Could not find VLLM::EngineCore process to kill")
🤖 Prompt for AI Agents
In tests/fault_tolerance/test_vllm_health_check.py around lines 194 to 203, the
loop may leave vllm_child as None if no child process matches
"VLLM::EngineCore"; add a check immediately after the loop that fails the test
if vllm_child is still None (e.g., assert vllm_child is not None or call
pytest.fail with a clear message) so the test does not continue without actually
killing the engine and invalidating the EngineDeadError scenario.


time.sleep(2) # Give some time for the worker to stabilize

# Step 5: Send a request triggering the handler to shutdown everything.
test_response = send_completion_request("How old are you?", 100, timeout=60)
logger.error(f"Test request failed: {test_response}")

# Step 6: Ensure the worker process has been stopped as a result of the EngineDeadError condition.
if worker.is_running():
# Wait for the vLLM engine process to actually die
if vllm_child:
try:
vllm_child.wait(timeout=30)
logger.info(f"vLLM engine process {vllm_child.pid} has been terminated")
except Exception as e:
logger.error(f"Failed to wait for vLLM engine process to die: {e}")
pytest.fail(f"vLLM engine process did not terminate: {e}")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Improve exception handling per static analysis recommendations.

The exception handling has two issues:

  1. Catching blind Exception is too broad and can mask unexpected errors.
  2. Using logging.error instead of logging.exception loses the traceback information.

Apply this diff to address both issues:

             # Wait for the vLLM engine process to actually die
             if vllm_child:
                 try:
                     vllm_child.wait(timeout=30)
                     logger.info(f"vLLM engine process {vllm_child.pid} has been terminated")
-                except Exception as e:
-                    logger.error(f"Failed to wait for vLLM engine process to die: {e}")
+                except (psutil.TimeoutExpired, psutil.NoSuchProcess) as e:
+                    logger.exception(f"Failed to wait for vLLM engine process to die: {e}")
                     pytest.fail(f"vLLM engine process did not terminate: {e}")

Note: You'll need to ensure psutil is imported if not already available. Alternatively, if using psutil.Process.wait(), catching psutil.TimeoutExpired is appropriate.

Committable suggestion skipped: line range outside the PR's diff.

🧰 Tools
🪛 Ruff (0.14.3)

210-210: Do not catch blind exception: Exception

(BLE001)


211-211: Use logging.exception instead of logging.error

Replace with exception

(TRY400)

🤖 Prompt for AI Agents
In tests/fault_tolerance/test_vllm_health_check.py around lines 205 to 212, the
try/except is catching a broad Exception and logging without traceback; change
it to catch the specific timeout exception (use subprocess.TimeoutExpired or
psutil.TimeoutExpired depending on whether vllm_child is a subprocess.Popen or
psutil.Process) and handle other unexpected errors by re-raising; replace
logger.error with logger.exception when logging the timeout failure so the
traceback is captured; if you decide to use psutil.TimeoutExpired, ensure psutil
is imported at the top of the file.


# Step 5: Send a request that should fail due to the dead engine.
try:
test_response = send_completion_request("How old are you?", 100, timeout=60)
# Verify the response indicates an error (non-2xx status code)
if test_response.status_code < 400:
pytest.fail(
f"Expected error response after killing vLLM engine, but got status {test_response.status_code}"
)
logger.info(f"Request correctly failed with status {test_response.status_code}: {test_response.text}")
except requests.exceptions.RequestException as e:
# It's also acceptable for the request to raise an exception
logger.info(f"Request correctly failed with exception: {e}")

# Step 6: Wait for the worker process to stop as a result of the EngineDeadError condition.
max_wait = 60 # seconds
poll_interval = 0.5 # seconds
elapsed = 0
while elapsed < max_wait:
if not worker.is_running():
logger.info(f"Worker stopped after {elapsed:.1f} seconds")
break
time.sleep(poll_interval)
elapsed += poll_interval
else:
# Loop completed without break - worker still running
pytest.fail(
"Worker should not be running after killing vLLM engine process."
f"Worker should not be running after killing vLLM engine process (waited {max_wait}s)"
)


Expand Down
Loading