# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import base64
+import io
import warnings
from collections.abc import Mapping
from typing import Literal

import pytest
+import torch
from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy

from vllm.assets.audio import AudioAsset
@@ -987,6 +990,203 @@ async def test_parse_chat_messages_audio_embeds_async(
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])


+def test_parse_chat_messages_multiple_image_embeds(
+    phi3v_model_config_image_embeds,
+    phi3v_tokenizer,
+):
+    """Test that multiple image_embeds in a single message are now supported.
+
+    This test validates the fix for the limitation that previously allowed
+    only one {'type': 'image_embeds'} item per message. Now multiple image
+    embeddings can be provided in a single request, similar to regular images.
+    """
+    # Create two sample image embedding tensors
+    image_embedding_1 = torch.randn(256, 1024)
+    image_embedding_2 = torch.randn(128, 1024)
+
+    # Encode them as base64
+    def encode_embedding(embedding):
+        buffer = io.BytesIO()
+        torch.save(embedding, buffer)
+        buffer.seek(0)
+        binary_data = buffer.read()
+        return base64.b64encode(binary_data).decode("utf-8")
+
+    base64_image_embedding_1 = encode_embedding(image_embedding_1)
+    base64_image_embedding_2 = encode_embedding(image_embedding_2)
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": base64_image_embedding_1,
+                    },
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": base64_image_embedding_2,
+                    },
+                    {"type": "text", "text": "Describe these two images."},
+                ],
+            }
+        ],
+        phi3v_model_config_image_embeds,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    # Verify conversation structure
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nDescribe these two images.",
+        }
+    ]
+
+    # Verify mm_data contains a list of embeddings (not a single embedding)
+    assert mm_data is not None
+    assert "image" in mm_data
+    assert isinstance(mm_data["image"], list)
+    assert len(mm_data["image"]) == 2
+
+    # Verify each embedding has the correct shape
+    assert isinstance(mm_data["image"][0], torch.Tensor)
+    assert mm_data["image"][0].shape == image_embedding_1.shape
+    assert isinstance(mm_data["image"][1], torch.Tensor)
+    assert mm_data["image"][1].shape == image_embedding_2.shape
+
+    # Verify UUIDs (None since we didn't provide any)
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
+
+
+def test_parse_chat_messages_multiple_image_embeds_with_uuids(
+    phi3v_model_config_image_embeds,
+    phi3v_tokenizer,
+):
+    """Test multiple image_embeds with UUIDs.
+
+    This validates that UUIDs are properly tracked for multiple embeddings.
+    """
+    uuid1 = "image-uuid-1"
+    uuid2 = "image-uuid-2"
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": None,
+                        "uuid": uuid1,
+                    },
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": None,
+                        "uuid": uuid2,
+                    },
+                    {"type": "text", "text": "Compare these images."},
+                ],
+            }
+        ],
+        phi3v_model_config_image_embeds,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    # Verify conversation structure
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nCompare these images.",
+        }
+    ]
+
+    # Verify mm_data contains a list with None values (UUID references)
+    assert mm_data is not None
+    assert "image" in mm_data
+    assert isinstance(mm_data["image"], list)
+    assert len(mm_data["image"]) == 2
+    assert mm_data["image"][0] is None
+    assert mm_data["image"][1] is None
+
+    # Verify UUIDs are correctly tracked
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[uuid1, uuid2])
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_multiple_image_embeds_async(
+    phi3v_model_config_image_embeds,
+    phi3v_tokenizer,
+):
+    """Test multiple image_embeds with async parsing.
+
+    This validates the AsyncMultiModalItemTracker also supports multiple embeddings.
+    """
+    # Create two sample image embedding tensors
+    image_embedding_1 = torch.randn(200, 768)
+    image_embedding_2 = torch.randn(150, 768)
+
+    # Encode them as base64
+    def encode_embedding(embedding):
+        buffer = io.BytesIO()
+        torch.save(embedding, buffer)
+        buffer.seek(0)
+        binary_data = buffer.read()
+        return base64.b64encode(binary_data).decode("utf-8")
+
+    base64_image_embedding_1 = encode_embedding(image_embedding_1)
+    base64_image_embedding_2 = encode_embedding(image_embedding_2)
+
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": base64_image_embedding_1,
+                    },
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": base64_image_embedding_2,
+                    },
+                    {"type": "text", "text": "What do these images show?"},
+                ],
+            }
+        ],
+        phi3v_model_config_image_embeds,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    # Verify conversation structure
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nWhat do these images show?",
+        }
+    ]
+
+    # Await the future and verify mm_data
+    mm_data = await mm_future
+    assert mm_data is not None
+    assert "image" in mm_data
+    assert isinstance(mm_data["image"], list)
+    assert len(mm_data["image"]) == 2
+
+    # Verify each embedding has the correct shape
+    assert isinstance(mm_data["image"][0], torch.Tensor)
+    assert mm_data["image"][0].shape == image_embedding_1.shape
+    assert isinstance(mm_data["image"][1], torch.Tensor)
+    assert mm_data["image"][1].shape == image_embedding_2.shape
+
+    # Verify UUIDs
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
+
+
@pytest.mark.asyncio
async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
    phi3v_model_config_image_embeds,