1+ using System . Collections . Generic ;
2+ using System . IO ;
13using System . Text . RegularExpressions ;
24using LLama . Common ;
35using Spectre . Console ;
68
79namespace LLama . Examples . Examples
810{
9- // This example shows how to chat with LLaVA model with both image and text as input.
11+ // This example shows how to chat with an Mtmd model using both image and text as input.
1012 // It uses the interactive executor for inference.
11- public class LlavaInteractiveModeExecute
13+ public class MtmdInteractiveModeExecute
1214 {
1315 public static async Task Run ( )
1416 {
1517 string multiModalProj = UserSettings . GetMMProjPath ( ) ;
1618 string modelPath = UserSettings . GetModelPath ( ) ;
1719 string modelImage = UserSettings . GetImagePath ( ) ;
18- const int maxTokens = 1024 ;
20+ const int maxTokens = 2048 ;
1921
2022 var prompt = $ "{{{modelImage}}}\n USER:\n Provide a full description of the image.\n ASSISTANT:\n ";
2123
2224 var parameters = new ModelParams ( modelPath ) ;
2325
26+ var mtmdParameters = MtmdContextParams . Default ( ) ;
27+ mtmdParameters . UseGpu = false ;
28+
2429 using var model = await LLamaWeights . LoadFromFileAsync ( parameters ) ;
2530 using var context = model . CreateContext ( parameters ) ;
26-
27- // Llava Init
28- using var clipModel = await LLavaWeights . LoadFromFileAsync ( multiModalProj ) ;
29-
31+
32+ // Mtmd Init
33+ using var clipModel = await SafeMtmdWeights . LoadFromFileAsync ( multiModalProj , model , mtmdParameters ) ;
34+
35+ var mediaMarker = mtmdParameters . MediaMarker ?? NativeApi . MtmdDefaultMarker ( ) ?? "<media>" ;
36+
3037 var ex = new InteractiveExecutor ( context , clipModel ) ;
3138
3239 Console . ForegroundColor = ConsoleColor . Yellow ;
@@ -40,38 +47,61 @@ public static async Task Run()
4047 Temperature = 0.1f
4148 } ,
4249
43- AntiPrompts = new List < string > { "\n USER :" } ,
50+ AntiPrompts = new List < string > { "\n ASSISTANT :" } ,
4451 MaxTokens = maxTokens
4552
4653 } ;
4754
4855 do
4956 {
5057
51- // Evaluate if we have images
58+ // Evaluate if we have media
5259 //
53- var imageMatches = Regex . Matches ( prompt , "{([^}]*)}" ) . Select ( m => m . Value ) ;
54- var imageCount = imageMatches . Count ( ) ;
55- var hasImages = imageCount > 0 ;
60+ var mediaMatches = Regex . Matches ( prompt , "{([^}]*)}" ) . Select ( m => m . Value ) ;
61+ var mediaCount = mediaMatches . Count ( ) ;
62+ var hasMedia = mediaCount > 0 ;
5663
57- if ( hasImages )
64+ if ( hasMedia )
5865 {
59- var imagePathsWithCurlyBraces = Regex . Matches ( prompt , "{([^}]*)}" ) . Select ( m => m . Value ) ;
60- var imagePaths = Regex . Matches ( prompt , "{([^}]*)}" ) . Select ( m => m . Groups [ 1 ] . Value ) . ToList ( ) ;
66+ var mediaPathsWithCurlyBraces = Regex . Matches ( prompt , "{([^}]*)}" ) . Select ( m => m . Value ) ;
67+ var mediaPaths = Regex . Matches ( prompt , "{([^}]*)}" ) . Select ( m => m . Groups [ 1 ] . Value ) . ToList ( ) ;
6168
62- List < byte [ ] > imageBytes ;
69+ var embeds = new List < SafeMtmdEmbed > ( ) ;
70+ var imageList = new List < byte [ ] > ( ) ;
71+ var imageExtensions = new HashSet < string > ( StringComparer . OrdinalIgnoreCase )
72+ {
73+ ".png" ,
74+ ".jpg" ,
75+ ".jpeg" ,
76+ ".bmp" ,
77+ ".gif" ,
78+ ".webp"
79+ } ;
80+
6381 try
6482 {
65- imageBytes = imagePaths . Select ( File . ReadAllBytes ) . ToList ( ) ;
83+ foreach ( var mediaPath in mediaPaths )
84+ {
85+ var extension = Path . GetExtension ( mediaPath ) ;
86+ if ( ! string . IsNullOrEmpty ( extension ) && imageExtensions . Contains ( extension ) )
87+ {
88+ // Keep the raw image data so the caller can reuse or inspect the images later.
89+ imageList . Add ( File . ReadAllBytes ( mediaPath ) ) ;
90+ }
91+
92+ var embed = clipModel . LoadMedia ( mediaPath ) ;
93+ embeds . Add ( embed ) ;
94+ }
6695 }
6796 catch ( IOException exception )
6897 {
6998 Console . ForegroundColor = ConsoleColor . Red ;
7099 Console . Write (
71- $ "Could not load your { ( imageCount == 1 ? "image " : "images " ) } :") ;
100+ $ "Could not load your { ( mediaCount == 1 ? "media " : "medias " ) } :") ;
72101 Console . Write ( $ "{ exception . Message } ") ;
73102 Console . ForegroundColor = ConsoleColor . Yellow ;
74103 Console . WriteLine ( "Please try again." ) ;
104+ clipModel . ClearMedia ( ) ;
75105 break ;
76106 }
77107
@@ -81,19 +111,17 @@ public static async Task Run()
81111 // https://github.com/ggerganov/llama.cpp/discussions/3620
82112 ex . Context . NativeHandle . MemorySequenceRemove ( LLamaSeqId . Zero , - 1 , - 1 ) ;
83113
84- int index = 0 ;
85- foreach ( var path in imagePathsWithCurlyBraces )
114+ // Replace each media placeholder with the media marker (one marker per media item)
115+ foreach ( var path in mediaPathsWithCurlyBraces )
86116 {
87- // First image replace to tag <image, the rest of the images delete the tag
88- prompt = prompt . Replace ( path , index ++ == 0 ? "<image>" : "" ) ;
117+ prompt = prompt . Replace ( path , mediaMarker , StringComparison . Ordinal ) ;
89118 }
90119
91-
92120 Console . ForegroundColor = ConsoleColor . Yellow ;
93121 Console . WriteLine ( $ "Here are the images, that are sent to the chat model in addition to your message.") ;
94122 Console . WriteLine ( ) ;
95123
96- foreach ( var consoleImage in imageBytes ? . Select ( bytes => new CanvasImage ( bytes ) ) ?? Array . Empty < CanvasImage > ( ) )
124+ foreach ( var consoleImage in imageList . Select ( image => new CanvasImage ( image . ToArray ( ) ) ) )
97125 {
98126 consoleImage . MaxWidth = 50 ;
99127 AnsiConsole . Write ( consoleImage ) ;
@@ -108,10 +136,9 @@ public static async Task Run()
108136
109137 // Initialize media embeds in executor
110138 //
111- foreach ( var image in imagePaths )
112- {
113- ex . Images . Add ( await File . ReadAllBytesAsync ( image ) ) ;
114- }
139+ ex . Embeds . Clear ( ) ;
140+ foreach ( var embed in embeds )
141+ ex . Embeds . Add ( embed ) ;
115142 }
116143
117144 Console . ForegroundColor = Color . White ;
0 commit comments