Skip to content

Commit c4e242c

Browse files
author
Jeremy Teboul
committed
feat: Support multiple image/audio embeddings per request in Chat Completions API
- Remove limitation that only allowed one message with image_embeds/audio_embeds
- Update MultiModalItemTracker and AsyncMultiModalItemTracker to treat embeddings as lists
- Add unit tests for multiple image embeddings support:
  * test_parse_chat_messages_multiple_image_embeds
  * test_parse_chat_messages_multiple_image_embeds_with_uuids
  * test_parse_chat_messages_multiple_image_embeds_async
- Embeddings now behave consistently with regular images/audios
- Validation via existing validate_num_items() against --limit-mm-per-prompt
- Backward compatible with single embeddings

This enables the Chat Completions API to leverage the model's existing capability for multiple embeddings, previously only accessible through the direct LLM inference API.
1 parent d7284a2 commit c4e242c

File tree

2 files changed

+204
-18
lines changed

2 files changed

+204
-18
lines changed

tests/entrypoints/test_chat_utils.py

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
33

4+
import base64
5+
import io
46
import warnings
57
from collections.abc import Mapping
68
from typing import Literal
79

810
import pytest
11+
import torch
912
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
1013

1114
from vllm.assets.audio import AudioAsset
@@ -987,6 +990,203 @@ async def test_parse_chat_messages_audio_embeds_async(
987990
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
988991

989992

993+
def test_parse_chat_messages_multiple_image_embeds(
    phi3v_model_config_image_embeds,
    phi3v_tokenizer,
):
    """Verify that one message may carry several image_embeds parts.

    Previously only a single {'type': 'image_embeds'} part was accepted
    per request; this exercises the relaxed behavior where multiple
    image embeddings are handled like ordinary images.
    """
    # Two sample embedding tensors of different lengths.
    embeddings = [torch.randn(256, 1024), torch.randn(128, 1024)]

    def to_base64(tensor):
        # Serialize with torch.save into an in-memory buffer and wrap the
        # raw bytes in base64, matching how clients ship embeddings.
        sink = io.BytesIO()
        torch.save(tensor, sink)
        return base64.b64encode(sink.getvalue()).decode("utf-8")

    embed_parts = [
        {"type": "image_embeds", "image_embeds": to_base64(emb)}
        for emb in embeddings
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": embed_parts
                + [{"type": "text", "text": "Describe these two images."}],
            }
        ],
        phi3v_model_config_image_embeds,
        phi3v_tokenizer,
        content_format="string",
    )

    # Both embeds get their own image placeholder in the rendered prompt.
    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nDescribe these two images.",
        }
    ]

    # The tracker should surface a list of tensors, one per embed part
    # (not a single bare tensor as in the old single-embed behavior).
    assert mm_data is not None
    assert "image" in mm_data
    assert isinstance(mm_data["image"], list)
    assert len(mm_data["image"]) == 2

    # Round-tripped tensors keep their original shapes, in order.
    for parsed, original in zip(mm_data["image"], embeddings):
        assert isinstance(parsed, torch.Tensor)
        assert parsed.shape == original.shape

    # No UUIDs were supplied, so both entries are None.
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
1062+
1063+
1064+
def test_parse_chat_messages_multiple_image_embeds_with_uuids(
    phi3v_model_config_image_embeds,
    phi3v_tokenizer,
):
    """Check UUID tracking across several image_embeds parts.

    Each part carries a None payload plus a uuid, so parsing should
    yield placeholder None entries in mm_data and the supplied UUIDs
    in the order the parts appeared.
    """
    expected_uuids = ["image-uuid-1", "image-uuid-2"]

    content = [
        {"type": "image_embeds", "image_embeds": None, "uuid": item_uuid}
        for item_uuid in expected_uuids
    ]
    content.append({"type": "text", "text": "Compare these images."})

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": content}],
        phi3v_model_config_image_embeds,
        phi3v_tokenizer,
        content_format="string",
    )

    # Both embeds get their own image placeholder in the rendered prompt.
    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nCompare these images.",
        }
    ]

    # Payloads were omitted, so mm_data holds None placeholders that are
    # resolved elsewhere via the UUIDs.
    assert mm_data is not None
    assert "image" in mm_data
    assert isinstance(mm_data["image"], list)
    assert len(mm_data["image"]) == 2
    assert all(entry is None for entry in mm_data["image"])

    # UUIDs must come back in part order.
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=expected_uuids)
1117+
1118+
1119+
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_image_embeds_async(
    phi3v_model_config_image_embeds,
    phi3v_tokenizer,
):
    """Ensure the async tracker also accepts multiple image_embeds.

    Mirrors the sync test but goes through parse_chat_messages_futures,
    so mm_data arrives via an awaitable.
    """
    tensors = [torch.randn(200, 768), torch.randn(150, 768)]

    def b64_of(tensor):
        # torch.save into an in-memory buffer, then base64-encode the bytes.
        sink = io.BytesIO()
        torch.save(tensor, sink)
        return base64.b64encode(sink.getvalue()).decode("utf-8")

    message = {
        "role": "user",
        "content": [
            {"type": "image_embeds", "image_embeds": b64_of(t)} for t in tensors
        ]
        + [{"type": "text", "text": "What do these images show?"}],
    }

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        [message],
        phi3v_model_config_image_embeds,
        phi3v_tokenizer,
        content_format="string",
    )

    # Both embeds get their own image placeholder in the rendered prompt.
    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nWhat do these images show?",
        }
    ]

    # The async tracker hands back a future; resolving it should give a
    # list with one tensor per embed part.
    mm_data = await mm_future
    assert mm_data is not None
    assert "image" in mm_data
    assert isinstance(mm_data["image"], list)
    assert len(mm_data["image"]) == 2

    # Round-tripped tensors keep their original shapes, in order.
    for got, want in zip(mm_data["image"], tensors):
        assert isinstance(got, torch.Tensor)
        assert got.shape == want.shape

    # No UUIDs were supplied.
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
1188+
1189+
9901190
@pytest.mark.asyncio
9911191
async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
9921192
phi3v_model_config_image_embeds,

vllm/entrypoints/chat_utils.py

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -695,16 +695,10 @@ def all_mm_uuids(self) -> MultiModalUUIDDict | None:
695695
raise ValueError("Mixing raw image and embedding inputs is not allowed")
696696

697697
if "image_embeds" in uuids_by_modality:
698-
image_embeds_uuids = uuids_by_modality["image_embeds"]
699-
if len(image_embeds_uuids) > 1:
700-
raise ValueError("Only one message can have {'type': 'image_embeds'}")
701698
mm_uuids["image"] = uuids_by_modality["image_embeds"]
702699
if "image" in uuids_by_modality:
703700
mm_uuids["image"] = uuids_by_modality["image"] # UUIDs of images
704701
if "audio_embeds" in uuids_by_modality:
705-
audio_embeds_uuids = uuids_by_modality["audio_embeds"]
706-
if len(audio_embeds_uuids) > 1:
707-
raise ValueError("Only one message can have {'type': 'audio_embeds'}")
708702
mm_uuids["audio"] = uuids_by_modality["audio_embeds"]
709703
if "audio" in uuids_by_modality:
710704
mm_uuids["audio"] = uuids_by_modality["audio"] # UUIDs of audios
@@ -730,16 +724,12 @@ def all_mm_data(self) -> MultiModalDataDict | None:
730724

731725
if "image_embeds" in items_by_modality:
732726
image_embeds_lst = items_by_modality["image_embeds"]
733-
if len(image_embeds_lst) > 1:
734-
raise ValueError("Only one message can have {'type': 'image_embeds'}")
735-
mm_inputs["image"] = image_embeds_lst[0]
727+
mm_inputs["image"] = image_embeds_lst
736728
if "image" in items_by_modality:
737729
mm_inputs["image"] = items_by_modality["image"] # A list of images
738730
if "audio_embeds" in items_by_modality:
739731
audio_embeds_lst = items_by_modality["audio_embeds"]
740-
if len(audio_embeds_lst) > 1:
741-
raise ValueError("Only one message can have {'type': 'audio_embeds'}")
742-
mm_inputs["audio"] = audio_embeds_lst[0]
732+
mm_inputs["audio"] = audio_embeds_lst
743733
if "audio" in items_by_modality:
744734
mm_inputs["audio"] = items_by_modality["audio"] # A list of audios
745735
if "video" in items_by_modality:
@@ -772,16 +762,12 @@ async def all_mm_data(self) -> MultiModalDataDict | None:
772762

773763
if "image_embeds" in items_by_modality:
774764
image_embeds_lst = items_by_modality["image_embeds"]
775-
if len(image_embeds_lst) > 1:
776-
raise ValueError("Only one message can have {'type': 'image_embeds'}")
777-
mm_inputs["image"] = image_embeds_lst[0]
765+
mm_inputs["image"] = image_embeds_lst
778766
if "image" in items_by_modality:
779767
mm_inputs["image"] = items_by_modality["image"] # A list of images
780768
if "audio_embeds" in items_by_modality:
781769
audio_embeds_lst = items_by_modality["audio_embeds"]
782-
if len(audio_embeds_lst) > 1:
783-
raise ValueError("Only one message can have {'type': 'audio_embeds'}")
784-
mm_inputs["audio"] = audio_embeds_lst[0]
770+
mm_inputs["audio"] = audio_embeds_lst
785771
if "audio" in items_by_modality:
786772
mm_inputs["audio"] = items_by_modality["audio"] # A list of audios
787773
if "video" in items_by_modality:

0 commit comments

Comments
 (0)