Commit b7c8256

MCP: Image refs (#1987)
* Add image reference support for MCP tool calls. Introduces image reference resolution and attachment for MCP tool arguments, allowing tools to accept lightweight image reference strings (e.g. 'latest', 'image_1') and receive resolved image payloads. Updates tool invocation and flow logic to use these references, and improves tool prompt instructions for image input handling.
* Enable multiple file uploads in ChatInput.
* Refactor multimodal model selection logic.
* Remove 'latest' image reference support. Eliminates handling of the 'latest' image reference in image resolver logic and updates related comments and prompts to only support 'image_1', 'image_2', etc. This simplifies image reference usage and clarifies instructions for tool input parameters.
* Refactor imageRefs to fileRefs for tool payloads.
1 parent a0a94c7 commit b7c8256
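
In practice the change swaps inline base64 image payloads in tool arguments for short references that the server resolves just before invoking the tool. A rough sketch of the two argument shapes (illustrative values only; the concrete field rules live in the new fileRefs module shown further down):

```ts
// Illustrative sketch only. The model emits a lightweight reference...
const argsFromModel = {
	image_ref: "image_1", // first image attached to the latest user message
};

// ...and the server hydrates it before the MCP tool call (per the "image_ref" rule below).
const argsSentToTool = {
	image_ref: "image_1",
	image: {
		name: "example.png", // hypothetical file name
		mime: "image/png",
		base64: "<base64 payload>",
	},
};
```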

12 files changed: +237 -44 lines changed

.env

Lines changed: 2 additions & 2 deletions
```diff
@@ -72,9 +72,9 @@ LLM_ROUTER_MAX_ASSISTANT_LENGTH=500
 # Maximum length (in characters) for previous user messages sent to router (latest user message not trimmed, default 400)
 LLM_ROUTER_MAX_PREV_USER_LENGTH=400
 
-# Enable router multimodal fallback (set to true to allow image inputs via router)
+# Enable router multimodal handling (set to true to allow image inputs via router)
 LLM_ROUTER_ENABLE_MULTIMODAL=
-# Optional: specific model to use for multimodal requests. If not set, uses first multimodal model
+# Required when LLM_ROUTER_ENABLE_MULTIMODAL=true: id or name of the multimodal model to use for image requests
 LLM_ROUTER_MULTIMODAL_MODEL=
 
 # Enable router tool support (set to true to allow tool calling via router)
```

README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -144,7 +144,7 @@ When you select Omni in the UI, Chat UI will:
 
 Tool and multimodal shortcuts:
 
-- Multimodal: If `LLM_ROUTER_ENABLE_MULTIMODAL=true` and the user sends an image, the router bypasses Arch and uses `LLM_ROUTER_MULTIMODAL_MODEL` (or the first multimodal model). Route name: `multimodal`.
+- Multimodal: If `LLM_ROUTER_ENABLE_MULTIMODAL=true` and the user sends an image, the router bypasses Arch and uses the model specified in `LLM_ROUTER_MULTIMODAL_MODEL`. Route name: `multimodal`.
 - Tools: If `LLM_ROUTER_ENABLE_TOOLS=true` and the user has at least one MCP server enabled, the router bypasses Arch and uses `LLM_ROUTER_TOOLS_MODEL`. If that model is missing or misconfigured, it falls back to Arch routing. Route name: `agentic`.
 
 ### MCP Tools (Optional)
```

chart/env/dev.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -67,7 +67,7 @@ envVars:
   LLM_ROUTER_OTHER_ROUTE: "casual_conversation"
   LLM_ROUTER_ARCH_TIMEOUT_MS: "10000"
   LLM_ROUTER_ENABLE_MULTIMODAL: "true"
-  LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Thinking"
+  LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
   LLM_ROUTER_ENABLE_TOOLS: "true"
   LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
   MCP_SERVERS: >
```

chart/env/prod.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -77,7 +77,7 @@ envVars:
   LLM_ROUTER_OTHER_ROUTE: "casual_conversation"
   LLM_ROUTER_ARCH_TIMEOUT_MS: "10000"
   LLM_ROUTER_ENABLE_MULTIMODAL: "true"
-  LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-30B-A3B-Instruct"
+  LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
   LLM_ROUTER_ENABLE_TOOLS: "true"
   LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
   MCP_SERVERS: >
```

src/lib/components/chat/ChatInput.svelte

Lines changed: 2 additions & 1 deletion
```diff
@@ -241,6 +241,7 @@
 	class="absolute hidden size-0"
 	aria-label="Upload file"
 	type="file"
+	multiple
 	onchange={onFileChange}
 	onclick={(e) => {
 		if (requireAuthUser()) {
@@ -274,7 +275,7 @@
 	onSelect={() => openFilePickerImage()}
 >
 	<CarbonImage class="size-4 opacity-90 dark:opacity-80" />
-	Add image
+	Add image(s)
 </DropdownMenu.Item>
 {/if}
 
```

src/lib/server/router/endpoint.ts

Lines changed: 5 additions & 30 deletions
```diff
@@ -18,6 +18,7 @@ import {
 	pickToolsCapableModel,
 	ROUTER_TOOLS_ROUTE,
 } from "./toolsRoute";
+import { getConfiguredMultimodalModelId } from "./multimodal";
 
 const REASONING_BLOCK_REGEX = /<think>[\s\S]*?(?:<\/think>|$)/g;
 
@@ -176,43 +177,17 @@ export async function makeRouterEndpoint(routerModel: ProcessedModel): Promise<E
 		for await (const ev of gen) yield ev;
 	}
 
-	async function findFirstMultimodalCandidateId(): Promise<string | undefined> {
+	if (routerMultimodalEnabled && hasImageInput) {
+		let multimodalCandidate: string | undefined;
 		try {
 			const all = await getModels();
-
-			// Check if a specific multimodal model is configured via env variable
-			const preferredModelId = config.LLM_ROUTER_MULTIMODAL_MODEL;
-			if (preferredModelId) {
-				const preferredModel = all?.find(
-					(m) => (m.id === preferredModelId || m.name === preferredModelId) && m.multimodal
-				);
-				if (preferredModel) {
-					logger.info(
-						{ model: preferredModel.id ?? preferredModel.name },
-						"[router] using configured multimodal model"
-					);
-					return preferredModel.id ?? preferredModel.name;
-				}
-				logger.warn(
-					{ configuredModel: preferredModelId },
-					"[router] configured multimodal model not found or not multimodal, falling back to first available"
-				);
-			}
-
-			// Fallback to first multimodal model
-			const first = all?.find((m) => !m.isRouter && m.multimodal);
-			return first?.id ?? first?.name;
+			multimodalCandidate = getConfiguredMultimodalModelId(all);
 		} catch (e) {
 			logger.warn({ err: String(e) }, "[router] failed to load models for multimodal lookup");
-			return undefined;
 		}
-	}
-
-	if (routerMultimodalEnabled && hasImageInput) {
-		const multimodalCandidate = await findFirstMultimodalCandidateId();
 		if (!multimodalCandidate) {
 			throw new Error(
-				"No multimodal models are configured for the router. Remove the image or enable a multimodal model."
+				"Router multimodal is enabled but LLM_ROUTER_MULTIMODAL_MODEL is not correctly configured. Remove the image or configure a multimodal model via LLM_ROUTER_MULTIMODAL_MODEL."
 			);
 		}
 
```
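
For review purposes, the behavioral shift in this hunk boils down to dropping the silent fallback to the first available multimodal model. A self-contained sketch of old versus new selection (the `Model` type here is a stand-in, not the real `ProcessedModel`):

```ts
// Stand-in type with only the fields the selection logic touches.
type Model = { id?: string; name: string; multimodal?: boolean; isRouter?: boolean };

// Old behavior: prefer the configured model, otherwise silently fall back to the first multimodal one.
function pickMultimodalOld(models: Model[], configured?: string): string | undefined {
	const preferred = models.find(
		(m) => (m.id === configured || m.name === configured) && m.multimodal
	);
	if (preferred) return preferred.id ?? preferred.name;
	const first = models.find((m) => !m.isRouter && m.multimodal);
	return first?.id ?? first?.name;
}

// New behavior: LLM_ROUTER_MULTIMODAL_MODEL is required; no fallback, callers throw or skip instead.
function pickMultimodalNew(models: Model[], configured?: string): string | undefined {
	if (!configured?.trim()) return undefined;
	const match = models.find(
		(m) => (m.id === configured || m.name === configured) && !m.isRouter && m.multimodal
	);
	return match?.id ?? match?.name;
}
```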

src/lib/server/router/multimodal.ts

Lines changed: 28 additions & 0 deletions

```diff
@@ -0,0 +1,28 @@
+import { config } from "$lib/server/config";
+import type { ProcessedModel } from "../models";
+
+/**
+ * Returns the configured multimodal model when it exists and is valid.
+ * - Requires LLM_ROUTER_MULTIMODAL_MODEL to be set (id or name).
+ * - Ignores router aliases and non-multimodal models.
+ */
+export function findConfiguredMultimodalModel(
+	models: ProcessedModel[] | undefined
+): ProcessedModel | undefined {
+	const preferredModelId = (config.LLM_ROUTER_MULTIMODAL_MODEL || "").trim();
+	if (!preferredModelId || !models?.length) return undefined;
+
+	return models.find(
+		(candidate) =>
+			(candidate.id === preferredModelId || candidate.name === preferredModelId) &&
+			!candidate.isRouter &&
+			candidate.multimodal
+	);
+}
+
+export function getConfiguredMultimodalModelId(
+	models: ProcessedModel[] | undefined
+): string | undefined {
+	const model = findConfiguredMultimodalModel(models);
+	return model?.id ?? model?.name;
+}
```
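
A minimal usage sketch of the helper above, mirroring how endpoint.ts calls it; the `getModels` import path is assumed from context, it is not shown in this diff:

```ts
import { getModels } from "$lib/server/models"; // assumed location of the model-list loader
import { getConfiguredMultimodalModelId } from "$lib/server/router/multimodal";

async function resolveMultimodalIdOrThrow(): Promise<string> {
	const models = await getModels();
	const id = getConfiguredMultimodalModelId(models);
	if (!id) {
		throw new Error("LLM_ROUTER_MULTIMODAL_MODEL is unset, unknown, or not a multimodal model");
	}
	return id;
}
```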
src/lib/server/textGeneration/mcp/fileRefs.ts

Lines changed: 168 additions & 0 deletions

```diff
@@ -0,0 +1,168 @@
+import type { EndpointMessage } from "../../endpoints/endpoints";
+
+export type FileRefPayload = {
+	name: string;
+	mime: string;
+	base64: string;
+};
+
+export type RefKind = {
+	prefix: string;
+	matches: (mime: string) => boolean;
+	toDataUrl?: (payload: FileRefPayload) => string;
+};
+
+export type ResolvedFileRef = FileRefPayload & { refKind: RefKind };
+export type FileRefResolver = (ref: string) => ResolvedFileRef | undefined;
+
+const IMAGE_REF_KIND: RefKind = {
+	prefix: "image",
+	matches: (mime) => typeof mime === "string" && mime.startsWith("image/"),
+	toDataUrl: (payload) => `data:${payload.mime};base64,${payload.base64}`,
+};
+
+const DEFAULT_REF_KINDS: RefKind[] = [IMAGE_REF_KIND];
+
+/**
+ * Build a resolver that maps short ref strings (e.g. "image_1") to the
+ * corresponding file payload for the latest user message containing files of
+ * the allowed kinds. Currently only images are exposed to end users, but the
+ * plumbing supports additional kinds later.
+ */
+export function buildFileRefResolver(
+	messages: EndpointMessage[],
+	refKinds: RefKind[] = DEFAULT_REF_KINDS
+): FileRefResolver | undefined {
+	if (!Array.isArray(refKinds) || refKinds.length === 0) return undefined;
+
+	// Find the newest user message that has at least one matching file
+	let lastUserWithFiles: EndpointMessage | undefined;
+	for (let i = messages.length - 1; i >= 0; i -= 1) {
+		const msg = messages[i];
+		if (msg.from !== "user") continue;
+		const hasMatch = (msg.files ?? []).some((file) => {
+			const mime = file?.mime;
+			return refKinds.some((kind) => kind.matches(mime ?? ""));
+		});
+		if (hasMatch) {
+			lastUserWithFiles = msg;
+			break;
+		}
+	}
+
+	if (!lastUserWithFiles) return undefined;
+
+	// Bucket matched files by ref kind while preserving order within the message
+	const buckets = new Map<RefKind, FileRefPayload[]>();
+	for (const file of lastUserWithFiles.files ?? []) {
+		const mime = file?.mime ?? "";
+		const kind = refKinds.find((k) => k.matches(mime));
+		if (!kind) continue;
+		const payload: FileRefPayload = { name: file.name, mime, base64: file.value };
+		const arr = buckets.get(kind) ?? [];
+		arr.push(payload);
+		buckets.set(kind, arr);
+	}
+
+	if (buckets.size === 0) return undefined;
+
+	const resolver: FileRefResolver = (ref) => {
+		if (!ref || typeof ref !== "string") return undefined;
+		const trimmed = ref.trim().toLowerCase();
+		for (const kind of refKinds) {
+			const match = new RegExp(`^${kind.prefix}_(\\d+)$`).exec(trimmed);
+			if (!match) continue;
+			const idx = Number(match[1]) - 1;
+			const files = buckets.get(kind) ?? [];
+			if (Number.isFinite(idx) && idx >= 0 && idx < files.length) {
+				const payload = files[idx];
+				return payload ? { ...payload, refKind: kind } : undefined;
+			}
+		}
+		return undefined;
+	};
+
+	return resolver;
+}
+
+export function buildImageRefResolver(messages: EndpointMessage[]): FileRefResolver | undefined {
+	return buildFileRefResolver(messages, [IMAGE_REF_KIND]);
+}
+
+type FieldRule = {
+	keys: string[];
+	action: "attachPayload" | "replaceWithDataUrl";
+	attachKey?: string;
+	allowedPrefixes?: string[]; // limit to specific ref kinds (e.g. ["image"])
+};
+
+const DEFAULT_FIELD_RULES: FieldRule[] = [
+	{
+		keys: ["image_ref"],
+		action: "attachPayload",
+		attachKey: "image",
+		allowedPrefixes: ["image"],
+	},
+	{
+		keys: ["input_image"],
+		action: "replaceWithDataUrl",
+		allowedPrefixes: ["image"],
+	},
+];
+
+/**
+ * Walk tool args and hydrate known ref fields while keeping logging lightweight.
+ * Only image refs are recognized for now to preserve current behavior.
+ */
+export function attachFileRefsToArgs(
+	argsObj: Record<string, unknown>,
+	resolveRef?: FileRefResolver,
+	fieldRules: FieldRule[] = DEFAULT_FIELD_RULES
+): void {
+	if (!resolveRef) return;
+
+	const visit = (node: unknown): void => {
+		if (!node || typeof node !== "object") return;
+		if (Array.isArray(node)) {
+			for (const v of node) visit(v);
+			return;
+		}
+
+		const obj = node as Record<string, unknown>;
+		for (const [key, value] of Object.entries(obj)) {
+			if (typeof value !== "string") {
+				if (value && typeof value === "object") visit(value);
+				continue;
+			}
+
+			const resolved = resolveRef(value);
+			if (!resolved) continue;
+
+			const rule = fieldRules.find((r) => r.keys.includes(key));
+			if (!rule) continue;
+			if (rule.allowedPrefixes && !rule.allowedPrefixes.includes(resolved.refKind.prefix)) continue;
+
+			if (rule.action === "attachPayload") {
+				const targetKey = rule.attachKey ?? "file";
+				if (
+					typeof obj[targetKey] !== "object" ||
+					obj[targetKey] === null ||
+					Array.isArray(obj[targetKey])
+				) {
+					obj[targetKey] = {
+						name: resolved.name,
+						mime: resolved.mime,
+						base64: resolved.base64,
+					};
+				}
+			} else if (rule.action === "replaceWithDataUrl") {
+				const toUrl =
+					resolved.refKind.toDataUrl ??
+					((p: FileRefPayload) => `data:${p.mime};base64,${p.base64}`);
+				obj[key] = toUrl(resolved);
+			}
+		}
+	};
+
+	visit(argsObj);
+}
```
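
A usage sketch tying the two exported entry points together: build a resolver over the conversation, then hydrate parsed tool arguments in place. The `$lib` import paths are assumed aliases for the relative paths above, and the message object is cast because `EndpointMessage` carries more fields than shown here.

```ts
import type { EndpointMessage } from "$lib/server/endpoints/endpoints";
import { attachFileRefsToArgs, buildImageRefResolver } from "$lib/server/textGeneration/mcp/fileRefs";

// Hypothetical conversation: the latest user message carries one PNG attachment.
const messages = [
	{
		from: "user",
		content: "What is in this picture?",
		files: [{ name: "cat.png", mime: "image/png", value: "<base64>" }],
	},
] as unknown as EndpointMessage[];

// Tool arguments as emitted by the model: references only, no payloads.
const args: Record<string, unknown> = { image_ref: "image_1", input_image: "image_1" };

const resolveRef = buildImageRefResolver(messages);
attachFileRefsToArgs(args, resolveRef);
// args.image       -> { name: "cat.png", mime: "image/png", base64: "<base64>" }
// args.input_image -> "data:image/png;base64,<base64>"
```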

src/lib/server/textGeneration/mcp/routerResolution.ts

Lines changed: 9 additions & 6 deletions
```diff
@@ -7,6 +7,7 @@ import {
 	pickToolsCapableModel,
 	ROUTER_TOOLS_ROUTE,
 } from "$lib/server/router/toolsRoute";
+import { findConfiguredMultimodalModel } from "$lib/server/router/multimodal";
 import type { EndpointMessage } from "../../endpoints/endpoints";
 import { stripReasoningFromMessageForRouting } from "../utils/routing";
 import type { ProcessedModel } from "../../models";
@@ -48,15 +49,17 @@ export async function resolveRouterTarget({
 	const allModels = mod.models as ProcessedModel[];
 
 	if (hasImageInput) {
-		const multimodalCandidate = allModels?.find(
-			(candidate) => !candidate.isRouter && candidate.multimodal
-		);
-		if (multimodalCandidate) {
+		const multimodalCandidate = findConfiguredMultimodalModel(allModels);
+		if (!multimodalCandidate) {
+			runMcp = false;
+			logger.warn(
+				{ configuredModel: config.LLM_ROUTER_MULTIMODAL_MODEL },
+				"[mcp] multimodal input but configured model missing or invalid; skipping MCP route"
+			);
+		} else {
 			targetModel = multimodalCandidate;
 			candidateModelId = multimodalCandidate.id ?? multimodalCandidate.name;
 			resolvedRoute = "multimodal";
-		} else {
-			runMcp = false;
 		}
 	} else {
 		// If tools are enabled and at least one MCP server is active, prefer a tools-capable model
```
src/lib/server/textGeneration/mcp/runMcpFlow.ts

Lines changed: 5 additions & 1 deletion
```diff
@@ -1,6 +1,5 @@
 import { config } from "$lib/server/config";
 import { MessageUpdateType, type MessageUpdate } from "$lib/types/MessageUpdate";
-import type { EndpointMessage } from "../../endpoints/endpoints";
 import { getMcpServers } from "$lib/server/mcp/registry";
 import { isValidUrl } from "$lib/server/urlSafety";
 import { resetMcpToolsCache } from "$lib/server/mcp/tools";
@@ -14,11 +13,13 @@ import type {
 } from "openai/resources/chat/completions";
 import type { Stream } from "openai/streaming";
 import { buildToolPreprompt } from "../utils/toolPrompt";
+import type { EndpointMessage } from "../../endpoints/endpoints";
 import { resolveRouterTarget } from "./routerResolution";
 import { executeToolCalls, type NormalizedToolCall } from "./toolInvocation";
 import { drainPool } from "$lib/server/mcp/clientPool";
 import type { TextGenerationContext } from "../types";
 import { hasAuthHeader, isStrictHfMcpLogin, hasNonEmptyToken } from "$lib/server/mcp/hf";
+import { buildImageRefResolver } from "./fileRefs";
 
 export type RunMcpFlowContext = Pick<
 	TextGenerationContext,
@@ -200,6 +201,8 @@ export async function* runMcpFlow({
 		// If anything goes wrong reading the flag, proceed (previous behavior)
 	}
 
+	const resolveFileRef = buildImageRefResolver(messages);
+
 	const hasImageInput = messages.some((msg) =>
 		(msg.files ?? []).some(
 			(file) => typeof file?.mime === "string" && file.mime.startsWith("image/")
@@ -599,6 +602,7 @@ export async function* runMcpFlow({
 		mapping,
 		servers,
 		parseArgs,
+		resolveFileRef,
 		toPrimitive,
 		processToolOutput,
 		abortSignal,
```
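
The matching change on the invocation side (toolInvocation.ts) is not part of this excerpt; a plausible, purely illustrative consumption of the new `resolveFileRef` option would hydrate parsed arguments right before the MCP call:

```ts
import { attachFileRefsToArgs, type FileRefResolver } from "./fileRefs";

// Sketch only: not the actual executeToolCalls implementation, which is not shown in this diff.
function hydrateParsedArgs(
	parsedArgs: Record<string, unknown>,
	resolveFileRef?: FileRefResolver
): Record<string, unknown> {
	attachFileRefsToArgs(parsedArgs, resolveFileRef);
	return parsedArgs;
}
```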
