1 change: 0 additions & 1 deletion tests/v1/distributed/test_dbo.py
@@ -85,5 +85,4 @@ def test_dbo_dp_ep_gsm8k(all2all_backend: str, num_gpus_available):
assert accuracy >= MIN_ACCURACY, (
f"DBO+DP+EP accuracy too low ({all2all_backend}): "
f"{accuracy:.3f} < {MIN_ACCURACY:.3f} "
f"(correct: {results['num_correct']}/{results['num_questions']})"
)
4 changes: 1 addition & 3 deletions vllm/v1/attention/backends/utils.py
@@ -166,9 +166,7 @@ def _make_metadata_with_slice(
assert start_locs[first_req] <= first_tok < start_locs[first_req + 1], (
"Token slice start outside of first request"
)
assert start_locs[last_req] <= last_tok < start_locs[last_req + 1], (
"Token slice end outside of last request"
)
# NOTE: last token can be outside of the last request if we have CG padding.

# If the "middle" request has tokens in both ubatches, we have to split it.
# If ubatch_slice is the first ubatch then we will be splitting the last
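The dropped upper-bound assert is explained by the new NOTE above: once the second ubatch's token slice is padded out for CUDA-graph capture, its last token can land past the end of the last request. Below is a minimal sketch with hypothetical numbers (not taken from the diff) showing why the old check would now fail:

```python
# Hypothetical numbers only: two requests owning tokens [0, 4) and [4, 8),
# with the batch padded from 8 to 12 tokens for CUDA-graph capture.
start_locs = [0, 4, 8]        # cumulative token start offsets per request
last_req = 1                  # the ubatch ends in the second request
num_total_tokens = 12         # 8 real tokens + 4 tokens of CG padding
last_tok = num_total_tokens - 1

# The removed assert required last_tok to stay inside the last request's range,
# i.e. start_locs[last_req] <= last_tok < start_locs[last_req + 1].
print(start_locs[last_req] <= last_tok < start_locs[last_req + 1])  # False: 11 >= 8
```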
9 changes: 6 additions & 3 deletions vllm/v1/worker/dp_utils.py
@@ -93,13 +93,16 @@ def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch

# This just pads the second ubatch slice out to the total number of tokens
# (num_tokens + padding) since we do `create_ubatch_slices` before applying DP padding.
def _pad_out_ubatch_slice(ubatch_slices: UBatchSlices, num_total_tokens: int):
padded_second_ubatch_slice = slice(
def _pad_out_ubatch_slice(
ubatch_slices: UBatchSlices, num_total_tokens: int
) -> UBatchSlices:
padded_second_token_slice = slice(
ubatch_slices[1].token_slice.start, num_total_tokens
)
ubatch_slices[1] = UBatchSlice(
padded_second_ubatch_slice, padded_second_ubatch_slice
ubatch_slices[1].request_slice, padded_second_token_slice
)
return ubatch_slices


def _synchronize_dp_ranks(
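For illustration, a self-contained sketch of the fixed helper. The `UBatchSlice` stand-in below is an assumption (the real class lives elsewhere in vLLM); the point, as in the diff, is that the old code passed the padded token slice for both fields of the second ubatch, whereas the fix preserves the original `request_slice`, widens only the `token_slice` to `num_total_tokens`, and returns the modified list.

```python
from dataclasses import dataclass


@dataclass
class UBatchSlice:
    # Simplified stand-in for vLLM's UBatchSlice; field names follow the diff.
    request_slice: slice
    token_slice: slice


def _pad_out_ubatch_slice(ubatch_slices: list, num_total_tokens: int) -> list:
    # Widen only the token range of the second ubatch to cover the DP padding;
    # keep its request range as produced by create_ubatch_slices.
    padded_second_token_slice = slice(
        ubatch_slices[1].token_slice.start, num_total_tokens
    )
    ubatch_slices[1] = UBatchSlice(
        ubatch_slices[1].request_slice, padded_second_token_slice
    )
    return ubatch_slices


# Two requests split across two ubatches; 8 real tokens padded to 12.
slices = [
    UBatchSlice(slice(0, 1), slice(0, 4)),
    UBatchSlice(slice(1, 2), slice(4, 8)),
]
print(_pad_out_ubatch_slice(slices, 12)[1])
# UBatchSlice(request_slice=slice(1, 2), token_slice=slice(4, 12))
```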