@@ -119,6 +119,9 @@ let response = try await session.respond {
 }
 ```
 
+> [!NOTE]
+> Image inputs are not yet supported by Apple Foundation Models.
+
 ### Core ML
 
 Run [Core ML](https://developer.apple.com/documentation/coreml) models
@@ -143,6 +146,9 @@ Enable the trait in Package.swift:
 )
 ```
 
+> [!NOTE]
+> Image inputs are not currently supported with `CoreMLLanguageModel`.
+
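+Text prompts still work through the same session API. A minimal sketch,
+assuming `session` wraps a `CoreMLLanguageModel` and that the string-based
+`respond(to:)` call seen in the image examples elsewhere in this README also
+works without images:
+
+```swift
+// Text in, text out with a Core ML-backed session; only image inputs
+// are unsupported. `session` is assumed to come from the setup above.
+let summary = try await session.respond(to: "Summarize this repository in one sentence")
+print(summary.content)
+```
+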
 ### MLX
 
 Run [MLX](https://github.com/ml-explore/mlx-swift) models on Apple Silicon
@@ -157,6 +163,22 @@ let response = try await session.respond {
 }
 ```
 
+Vision support depends on the specific MLX model you load.
+Use a vision‑capable model for multimodal prompts
+(for example, a VLM variant).
+The following shows extracting text from an image:
+
+```swift
+let ocr = try await session.respond(
+    to: "Extract the total amount from this receipt",
+    images: [
+        .init(url: URL(fileURLWithPath: "/path/to/receipt_page1.png")),
+        .init(url: URL(fileURLWithPath: "/path/to/receipt_page2.png"))
+    ]
+)
+print(ocr.content)
+```
+
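+The OCR call above assumes `session` was created with a vision-capable
+MLX model rather than a text-only one. A minimal setup sketch (the
+`MLXLanguageModel` initializer label and the model identifier below are
+illustrative assumptions, not confirmed from this package's API):
+
+```swift
+// Hypothetical setup: check the package for the actual MLXLanguageModel
+// initializer, and substitute a vision-language model you have locally.
+let vlm = MLXLanguageModel(modelId: "mlx-community/Qwen2.5-VL-3B-Instruct-4bit")
+let session = LanguageModelSession(model: vlm)
+```
+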
 Enable the trait in Package.swift:
 
 ```swift
@@ -191,6 +213,9 @@ Enable the trait in Package.swift:
 )
 ```
 
+> [!NOTE]
+> Image inputs are not currently supported with `LlamaLanguageModel`.
+
 ### OpenAI
 
 Supports both
@@ -204,9 +229,17 @@ let model = OpenAILanguageModel(
 )
 
 let session = LanguageModelSession(model: model)
-let response = try await session.respond {
-    Prompt("Write a haiku about Swift")
-}
+let response = try await session.respond(
+    to: "List the objects you see",
+    images: [
+        .init(url: URL(string: "https://example.com/desk.jpg")!),
+        .init(
+            data: try Data(contentsOf: URL(fileURLWithPath: "/path/to/closeup.png")),
+            mimeType: "image/png"
+        )
+    ]
+)
+print(response.content)
 ```
 
 For OpenAI-compatible endpoints that use the older Chat Completions API:
@@ -236,6 +269,20 @@ let response = try await session.respond {
 }
 ```
 
+You can include images with your prompt,
+either by pointing to remote URLs or by constructing them from image data:
+
+```swift
+let response = try await session.respond(
+    to: "Explain the key parts of this diagram",
+    image: .init(
+        data: try Data(contentsOf: URL(fileURLWithPath: "/path/to/diagram.png")),
+        mimeType: "image/png"
+    )
+)
+print(response.content)
+```
+
 ### Google Gemini
 
 Uses the [Gemini API](https://ai.google.dev/api/generate-content) with Gemini models:
@@ -252,6 +299,16 @@ let response = try await session.respond {
 }
 ```
 
+Send images with your prompt using remote or local sources:
+
+```swift
+let response = try await session.respond(
+    to: "Identify the plants in this photo",
+    image: .init(url: URL(string: "https://example.com/garden.jpg")!)
+)
+print(response.content)
+```
+
 Gemini models use an internal ["thinking process"](https://ai.google.dev/gemini-api/docs/thinking)
 that improves reasoning and multi-step planning.
 You can configure how much Gemini should "think" using the `thinking` parameter:
@@ -300,11 +357,12 @@ let model = GeminiLanguageModel(
 
 ### Ollama
 
-Run models locally via Ollama's [HTTP API](https://github.com/ollama/ollama/blob/main/docs/api.md):
+Run models locally via Ollama's
+[HTTP API](https://github.com/ollama/ollama/blob/main/docs/api.md):
 
 ```swift
 // Default: connects to http://localhost:11434
-let model = OllamaLanguageModel(model: "qwen3")
+let model = OllamaLanguageModel(model: "qwen3") // `ollama pull qwen3:8b`
 
 // Custom endpoint
 let model = OllamaLanguageModel(
@@ -318,7 +376,22 @@ let response = try await session.respond {
 }
 ```
 
-First, pull the model: `ollama pull qwen3:0.6b`
+For local models, make sure you’re using a vision‑capable model
+(for example, a `-vl` variant).
+You can combine multiple images:
+
+```swift
+let model = OllamaLanguageModel(model: "qwen3-vl") // `ollama pull qwen3-vl:8b`
+let session = LanguageModelSession(model: model)
+let response = try await session.respond(
+    to: "Compare these posters and summarize their differences",
+    images: [
+        .init(url: URL(string: "https://example.com/poster1.jpg")!),
+        .init(url: URL(fileURLWithPath: "/path/to/poster2.jpg"))
+    ]
+)
+print(response.content)
+```
 ## Testing
 
 