|
6 | 6 | from typing import Literal |
7 | 7 |
|
8 | 8 | import pytest |
| 9 | +import torch |
9 | 10 | from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy |
10 | 11 |
|
11 | 12 | from vllm.assets.audio import AudioAsset |
@@ -915,6 +916,189 @@ async def test_parse_chat_messages_audio_embeds_async( |
915 | 916 | _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None]) |
916 | 917 |
|
917 | 918 |
|
def test_parse_chat_messages_multiple_image_embeds(
    phi3v_model_config_image_embeds,
    phi3v_tokenizer,
):
    """Multiple ``image_embeds`` parts in one message are parsed together.

    Earlier versions accepted only a single ``{'type': 'image_embeds'}`` part
    per request; this exercises the lifted restriction, mirroring how plain
    images already behave.
    """
    # Two differently-shaped embedding tensors, base64-encoded as the API expects.
    embeddings = [torch.randn(256, 1024), torch.randn(128, 1024)]
    encoded = [tensor2base64(t) for t in embeddings]

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_embeds", "image_embeds": encoded[0]},
                {"type": "image_embeds", "image_embeds": encoded[1]},
                {"type": "text", "text": "Describe these two images."},
            ],
        }
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config_image_embeds,
        phi3v_tokenizer,
        content_format="string",
    )

    # Each embedding part gets its own image placeholder in the rendered prompt.
    expected_content = "<|image_1|>\n<|image_2|>\nDescribe these two images."
    assert conversation == [{"role": "user", "content": expected_content}]

    # mm_data must hold a list of tensors (not a lone embedding), one per part.
    assert mm_data is not None
    assert "image" in mm_data
    assert isinstance(mm_data["image"], list)
    assert len(mm_data["image"]) == 2

    # Shapes of the decoded tensors must match the originals, in order.
    for parsed, original in zip(mm_data["image"], embeddings):
        assert isinstance(parsed, torch.Tensor)
        assert parsed.shape == original.shape

    # No UUIDs were supplied, so both tracked slots are None.
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
| 982 | + |
def test_parse_chat_messages_multiple_image_embeds_with_uuids(
    phi3v_model_config_image_embeds,
    phi3v_tokenizer,
):
    """Multiple ``image_embeds`` parts with UUIDs.

    Validates that per-part UUIDs are tracked in order when several
    embeddings are referenced by UUID (with ``image_embeds`` left as None).
    """
    uuids = ["image-uuid-1", "image-uuid-2"]

    # Each part references an embedding purely by UUID; no tensor payload.
    parts = [
        {"type": "image_embeds", "image_embeds": None, "uuid": u} for u in uuids
    ]
    parts.append({"type": "text", "text": "Compare these images."})

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": parts}],
        phi3v_model_config_image_embeds,
        phi3v_tokenizer,
        content_format="string",
    )

    # Placeholders are still emitted for UUID-only parts.
    expected_content = "<|image_1|>\n<|image_2|>\nCompare these images."
    assert conversation == [{"role": "user", "content": expected_content}]

    # mm_data holds a list of None entries — payloads come via UUID lookup.
    assert mm_data is not None
    assert "image" in mm_data
    assert isinstance(mm_data["image"], list)
    assert len(mm_data["image"]) == 2
    assert all(entry is None for entry in mm_data["image"])

    # UUIDs are tracked in the same order the parts appeared.
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=uuids)
| 1036 | + |
| 1037 | + |
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_image_embeds_async(
    phi3v_model_config_image_embeds,
    phi3v_tokenizer,
):
    """Multiple ``image_embeds`` parts through the async parsing path.

    Confirms the AsyncMultiModalItemTracker handles several embeddings in
    one message, matching the synchronous behavior.
    """
    # Two differently-shaped embedding tensors, base64-encoded for transport.
    embeddings = [torch.randn(200, 768), torch.randn(150, 768)]
    encoded = [tensor2base64(t) for t in embeddings]

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_embeds", "image_embeds": encoded[0]},
                {"type": "image_embeds", "image_embeds": encoded[1]},
                {"type": "text", "text": "What do these images show?"},
            ],
        }
    ]

    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
        messages,
        phi3v_model_config_image_embeds,
        phi3v_tokenizer,
        content_format="string",
    )

    # Prompt text is rendered immediately; one placeholder per embedding part.
    expected_content = "<|image_1|>\n<|image_2|>\nWhat do these images show?"
    assert conversation == [{"role": "user", "content": expected_content}]

    # The multimodal payload resolves asynchronously.
    mm_data = await mm_future
    assert mm_data is not None
    assert "image" in mm_data
    assert isinstance(mm_data["image"], list)
    assert len(mm_data["image"]) == 2

    # Decoded tensors retain their original shapes, in order.
    for parsed, original in zip(mm_data["image"], embeddings):
        assert isinstance(parsed, torch.Tensor)
        assert parsed.shape == original.shape

    # No UUIDs were supplied on either part.
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
| 1100 | + |
| 1101 | + |
918 | 1102 | @pytest.mark.asyncio |
919 | 1103 | async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( |
920 | 1104 | phi3v_model_config_image_embeds, |
|
0 commit comments