Commit b7c8256

MCP: Image refs (#1987)
* Add image reference support for MCP tool calls. Introduces image reference resolution and attachment for MCP tool arguments, allowing tools to accept lightweight image reference strings (e.g. 'latest', 'image_1') and receive resolved image payloads. Updates tool invocation and flow logic to use these references, and improves tool prompt instructions for image input handling.
* Enable multiple file uploads in ChatInput.
* Refactor multimodal model selection logic.
* Remove 'latest' image reference support. Eliminates handling of the 'latest' image reference in image resolver logic and updates related comments and prompts to only support 'image_1', 'image_2', etc. This simplifies image reference usage and clarifies instructions for tool input parameters.
* Refactor imageRefs to fileRefs for tool payloads.
1 parent a0a94c7 commit b7c8256
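
In practice the change swaps inline base64 image payloads in tool arguments for short references that the server resolves just before invoking the tool. A rough sketch of the two argument shapes (illustrative values only; the concrete field rules live in the new fileRefs module shown further down):

```ts
// Illustrative sketch only. The model emits a lightweight reference...
const argsFromModel = {
	image_ref: "image_1", // first image attached to the latest user message
};

// ...and the server hydrates it before the MCP tool call (per the "image_ref" rule below).
const argsSentToTool = {
	image_ref: "image_1",
	image: {
		name: "example.png", // hypothetical file name
		mime: "image/png",
		base64: "<base64 payload>",
	},
};
```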

12 files changed: +237 -44 lines changed

.env

Lines changed: 2 additions & 2 deletions
```diff
@@ -72,9 +72,9 @@ LLM_ROUTER_MAX_ASSISTANT_LENGTH=500
 # Maximum length (in characters) for previous user messages sent to router (latest user message not trimmed, default 400)
 LLM_ROUTER_MAX_PREV_USER_LENGTH=400
 
-# Enable router multimodal fallback (set to true to allow image inputs via router)
+# Enable router multimodal handling (set to true to allow image inputs via router)
 LLM_ROUTER_ENABLE_MULTIMODAL=
-# Optional: specific model to use for multimodal requests. If not set, uses first multimodal model
+# Required when LLM_ROUTER_ENABLE_MULTIMODAL=true: id or name of the multimodal model to use for image requests
 LLM_ROUTER_MULTIMODAL_MODEL=
 
 # Enable router tool support (set to true to allow tool calling via router)
```

README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -144,7 +144,7 @@ When you select Omni in the UI, Chat UI will:
 
 Tool and multimodal shortcuts:
 
-- Multimodal: If `LLM_ROUTER_ENABLE_MULTIMODAL=true` and the user sends an image, the router bypasses Arch and uses `LLM_ROUTER_MULTIMODAL_MODEL` (or the first multimodal model). Route name: `multimodal`.
+- Multimodal: If `LLM_ROUTER_ENABLE_MULTIMODAL=true` and the user sends an image, the router bypasses Arch and uses the model specified in `LLM_ROUTER_MULTIMODAL_MODEL`. Route name: `multimodal`.
 - Tools: If `LLM_ROUTER_ENABLE_TOOLS=true` and the user has at least one MCP server enabled, the router bypasses Arch and uses `LLM_ROUTER_TOOLS_MODEL`. If that model is missing or misconfigured, it falls back to Arch routing. Route name: `agentic`.
 
 ### MCP Tools (Optional)
```

chart/env/dev.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -67,7 +67,7 @@ envVars:
   LLM_ROUTER_OTHER_ROUTE: "casual_conversation"
   LLM_ROUTER_ARCH_TIMEOUT_MS: "10000"
   LLM_ROUTER_ENABLE_MULTIMODAL: "true"
-  LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Thinking"
+  LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
   LLM_ROUTER_ENABLE_TOOLS: "true"
   LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
   MCP_SERVERS: >
```

chart/env/prod.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -77,7 +77,7 @@ envVars:
   LLM_ROUTER_OTHER_ROUTE: "casual_conversation"
   LLM_ROUTER_ARCH_TIMEOUT_MS: "10000"
   LLM_ROUTER_ENABLE_MULTIMODAL: "true"
-  LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-30B-A3B-Instruct"
+  LLM_ROUTER_MULTIMODAL_MODEL: "Qwen/Qwen3-VL-235B-A22B-Instruct"
   LLM_ROUTER_ENABLE_TOOLS: "true"
   LLM_ROUTER_TOOLS_MODEL: "moonshotai/Kimi-K2-Instruct-0905"
   MCP_SERVERS: >
```

src/lib/components/chat/ChatInput.svelte

Lines changed: 2 additions & 1 deletion
```diff
@@ -241,6 +241,7 @@
 	class="absolute hidden size-0"
 	aria-label="Upload file"
 	type="file"
+	multiple
 	onchange={onFileChange}
 	onclick={(e) => {
 		if (requireAuthUser()) {
@@ -274,7 +275,7 @@
 	onSelect={() => openFilePickerImage()}
 >
 	<CarbonImage class="size-4 opacity-90 dark:opacity-80" />
-	Add image
+	Add image(s)
 </DropdownMenu.Item>
 {/if}
 
```

src/lib/server/router/endpoint.ts

Lines changed: 5 additions & 30 deletions
```diff
@@ -18,6 +18,7 @@ import {
 	pickToolsCapableModel,
 	ROUTER_TOOLS_ROUTE,
 } from "./toolsRoute";
+import { getConfiguredMultimodalModelId } from "./multimodal";
 
 const REASONING_BLOCK_REGEX = /<think>[\s\S]*?(?:<\/think>|$)/g;
 
@@ -176,43 +177,17 @@ export async function makeRouterEndpoint(routerModel: ProcessedModel): Promise<E
 		for await (const ev of gen) yield ev;
 	}
 
-	async function findFirstMultimodalCandidateId(): Promise<string | undefined> {
+	if (routerMultimodalEnabled && hasImageInput) {
+		let multimodalCandidate: string | undefined;
 		try {
 			const all = await getModels();
-
-			// Check if a specific multimodal model is configured via env variable
-			const preferredModelId = config.LLM_ROUTER_MULTIMODAL_MODEL;
-			if (preferredModelId) {
-				const preferredModel = all?.find(
-					(m) => (m.id === preferredModelId || m.name === preferredModelId) && m.multimodal
-				);
-				if (preferredModel) {
-					logger.info(
-						{ model: preferredModel.id ?? preferredModel.name },
-						"[router] using configured multimodal model"
-					);
-					return preferredModel.id ?? preferredModel.name;
-				}
-				logger.warn(
-					{ configuredModel: preferredModelId },
-					"[router] configured multimodal model not found or not multimodal, falling back to first available"
-				);
-			}
-
-			// Fallback to first multimodal model
-			const first = all?.find((m) => !m.isRouter && m.multimodal);
-			return first?.id ?? first?.name;
+			multimodalCandidate = getConfiguredMultimodalModelId(all);
 		} catch (e) {
 			logger.warn({ err: String(e) }, "[router] failed to load models for multimodal lookup");
-			return undefined;
 		}
-	}
-
-	if (routerMultimodalEnabled && hasImageInput) {
-		const multimodalCandidate = await findFirstMultimodalCandidateId();
 		if (!multimodalCandidate) {
 			throw new Error(
-				"No multimodal models are configured for the router. Remove the image or enable a multimodal model."
+				"Router multimodal is enabled but LLM_ROUTER_MULTIMODAL_MODEL is not correctly configured. Remove the image or configure a multimodal model via LLM_ROUTER_MULTIMODAL_MODEL."
 			);
 		}
 
```
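
For review purposes, the behavioral shift in this hunk boils down to dropping the silent fallback to the first available multimodal model. A self-contained sketch of old versus new selection (the `Model` type here is a stand-in, not the real `ProcessedModel`):

```ts
// Stand-in type with only the fields the selection logic touches.
type Model = { id?: string; name: string; multimodal?: boolean; isRouter?: boolean };

// Old behavior: prefer the configured model, otherwise silently fall back to the first multimodal one.
function pickMultimodalOld(models: Model[], configured?: string): string | undefined {
	const preferred = models.find(
		(m) => (m.id === configured || m.name === configured) && m.multimodal
	);
	if (preferred) return preferred.id ?? preferred.name;
	const first = models.find((m) => !m.isRouter && m.multimodal);
	return first?.id ?? first?.name;
}

// New behavior: LLM_ROUTER_MULTIMODAL_MODEL is required; no fallback, callers throw or skip instead.
function pickMultimodalNew(models: Model[], configured?: string): string | undefined {
	if (!configured?.trim()) return undefined;
	const match = models.find(
		(m) => (m.id === configured || m.name === configured) && !m.isRouter && m.multimodal
	);
	return match?.id ?? match?.name;
}
```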

src/lib/server/router/multimodal.ts

Lines changed: 28 additions & 0 deletions

```diff
@@ -0,0 +1,28 @@
+import { config } from "$lib/server/config";
+import type { ProcessedModel } from "../models";
+
+/**
+ * Returns the configured multimodal model when it exists and is valid.
+ * - Requires LLM_ROUTER_MULTIMODAL_MODEL to be set (id or name).
+ * - Ignores router aliases and non-multimodal models.
+ */
+export function findConfiguredMultimodalModel(
+	models: ProcessedModel[] | undefined
+): ProcessedModel | undefined {
+	const preferredModelId = (config.LLM_ROUTER_MULTIMODAL_MODEL || "").trim();
+	if (!preferredModelId || !models?.length) return undefined;
+
+	return models.find(
+		(candidate) =>
+			(candidate.id === preferredModelId || candidate.name === preferredModelId) &&
+			!candidate.isRouter &&
+			candidate.multimodal
+	);
+}
+
+export function getConfiguredMultimodalModelId(
+	models: ProcessedModel[] | undefined
+): string | undefined {
+	const model = findConfiguredMultimodalModel(models);
+	return model?.id ?? model?.name;
+}
```
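
A minimal usage sketch of the helper above, mirroring how endpoint.ts calls it; the `getModels` import path is assumed from context, it is not shown in this diff:

```ts
import { getModels } from "$lib/server/models"; // assumed location of the model-list loader
import { getConfiguredMultimodalModelId } from "$lib/server/router/multimodal";

async function resolveMultimodalIdOrThrow(): Promise<string> {
	const models = await getModels();
	const id = getConfiguredMultimodalModelId(models);
	if (!id) {
		throw new Error("LLM_ROUTER_MULTIMODAL_MODEL is unset, unknown, or not a multimodal model");
	}
	return id;
}
```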
src/lib/server/textGeneration/mcp/fileRefs.ts

Lines changed: 168 additions & 0 deletions

```diff
@@ -0,0 +1,168 @@
+import type { EndpointMessage } from "../../endpoints/endpoints";
+
+export type FileRefPayload = {
+	name: string;
+	mime: string;
+	base64: string;
+};
+
+export type RefKind = {
+	prefix: string;
+	matches: (mime: string) => boolean;
+	toDataUrl?: (payload: FileRefPayload) => string;
+};
+
+export type ResolvedFileRef = FileRefPayload & { refKind: RefKind };
+export type FileRefResolver = (ref: string) => ResolvedFileRef | undefined;
+
+const IMAGE_REF_KIND: RefKind = {
+	prefix: "image",
+	matches: (mime) => typeof mime === "string" && mime.startsWith("image/"),
+	toDataUrl: (payload) => `data:${payload.mime};base64,${payload.base64}`,
+};
+
+const DEFAULT_REF_KINDS: RefKind[] = [IMAGE_REF_KIND];
+
+/**
+ * Build a resolver that maps short ref strings (e.g. "image_1") to the
+ * corresponding file payload for the latest user message containing files of
+ * the allowed kinds. Currently only images are exposed to end users, but the
+ * plumbing supports additional kinds later.
+ */
+export function buildFileRefResolver(
+	messages: EndpointMessage[],
+	refKinds: RefKind[] = DEFAULT_REF_KINDS
+): FileRefResolver | undefined {
+	if (!Array.isArray(refKinds) || refKinds.length === 0) return undefined;
+
+	// Find the newest user message that has at least one matching file
+	let lastUserWithFiles: EndpointMessage | undefined;
+	for (let i = messages.length - 1; i >= 0; i -= 1) {
+		const msg = messages[i];
+		if (msg.from !== "user") continue;
+		const hasMatch = (msg.files ?? []).some((file) => {
+			const mime = file?.mime;
+			return refKinds.some((kind) => kind.matches(mime ?? ""));
+		});
+		if (hasMatch) {
+			lastUserWithFiles = msg;
+			break;
+		}
+	}
+
+	if (!lastUserWithFiles) return undefined;
+
+	// Bucket matched files by ref kind while preserving order within the message
+	const buckets = new Map<RefKind, FileRefPayload[]>();
+	for (const file of lastUserWithFiles.files ?? []) {
+		const mime = file?.mime ?? "";
+		const kind = refKinds.find((k) => k.matches(mime));
+		if (!kind) continue;
+		const payload: FileRefPayload = { name: file.name, mime, base64: file.value };
+		const arr = buckets.get(kind) ?? [];
+		arr.push(payload);
+		buckets.set(kind, arr);
+	}
+
+	if (buckets.size === 0) return undefined;
+
+	const resolver: FileRefResolver = (ref) => {
+		if (!ref || typeof ref !== "string") return undefined;
+		const trimmed = ref.trim().toLowerCase();
+		for (const kind of refKinds) {
+			const match = new RegExp(`^${kind.prefix}_(\\d+)$`).exec(trimmed);
+			if (!match) continue;
+			const idx = Number(match[1]) - 1;
+			const files = buckets.get(kind) ?? [];
+			if (Number.isFinite(idx) && idx >= 0 && idx < files.length) {
+				const payload = files[idx];
+				return payload ? { ...payload, refKind: kind } : undefined;
+			}
+		}
+		return undefined;
+	};
+
+	return resolver;
+}
+
+export function buildImageRefResolver(messages: EndpointMessage[]): FileRefResolver | undefined {
+	return buildFileRefResolver(messages, [IMAGE_REF_KIND]);
+}
+
+type FieldRule = {
+	keys: string[];
+	action: "attachPayload" | "replaceWithDataUrl";
+	attachKey?: string;
+	allowedPrefixes?: string[]; // limit to specific ref kinds (e.g. ["image"])
+};
+
+const DEFAULT_FIELD_RULES: FieldRule[] = [
+	{
+		keys: ["image_ref"],
+		action: "attachPayload",
+		attachKey: "image",
+		allowedPrefixes: ["image"],
+	},
+	{
+		keys: ["input_image"],
+		action: "replaceWithDataUrl",
+		allowedPrefixes: ["image"],
+	},
+];
+
+/**
+ * Walk tool args and hydrate known ref fields while keeping logging lightweight.
+ * Only image refs are recognized for now to preserve current behavior.
+ */
+export function attachFileRefsToArgs(
+	argsObj: Record<string, unknown>,
+	resolveRef?: FileRefResolver,
+	fieldRules: FieldRule[] = DEFAULT_FIELD_RULES
+): void {
+	if (!resolveRef) return;
+
+	const visit = (node: unknown): void => {
+		if (!node || typeof node !== "object") return;
+		if (Array.isArray(node)) {
+			for (const v of node) visit(v);
+			return;
+		}
+
+		const obj = node as Record<string, unknown>;
+		for (const [key, value] of Object.entries(obj)) {
+			if (typeof value !== "string") {
+				if (value && typeof value === "object") visit(value);
+				continue;
+			}
+
+			const resolved = resolveRef(value);
+			if (!resolved) continue;
+
+			const rule = fieldRules.find((r) => r.keys.includes(key));
+			if (!rule) continue;
+			if (rule.allowedPrefixes && !rule.allowedPrefixes.includes(resolved.refKind.prefix)) continue;
+
+			if (rule.action === "attachPayload") {
+				const targetKey = rule.attachKey ?? "file";
+				if (
+					typeof obj[targetKey] !== "object" ||
+					obj[targetKey] === null ||
+					Array.isArray(obj[targetKey])
+				) {
+					obj[targetKey] = {
+						name: resolved.name,
+						mime: resolved.mime,
+						base64: resolved.base64,
+					};
+				}
+			} else if (rule.action === "replaceWithDataUrl") {
+				const toUrl =
+					resolved.refKind.toDataUrl ??
+					((p: FileRefPayload) => `data:${p.mime};base64,${p.base64}`);
+				obj[key] = toUrl(resolved);
+			}
+		}
+	};
+
+	visit(argsObj);
+}
```
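
A usage sketch tying the two exported entry points together: build a resolver over the conversation, then hydrate parsed tool arguments in place. The `$lib` import paths are assumed aliases for the relative paths above, and the message object is cast because `EndpointMessage` carries more fields than shown here.

```ts
import type { EndpointMessage } from "$lib/server/endpoints/endpoints";
import { attachFileRefsToArgs, buildImageRefResolver } from "$lib/server/textGeneration/mcp/fileRefs";

// Hypothetical conversation: the latest user message carries one PNG attachment.
const messages = [
	{
		from: "user",
		content: "What is in this picture?",
		files: [{ name: "cat.png", mime: "image/png", value: "<base64>" }],
	},
] as unknown as EndpointMessage[];

// Tool arguments as emitted by the model: references only, no payloads.
const args: Record<string, unknown> = { image_ref: "image_1", input_image: "image_1" };

const resolveRef = buildImageRefResolver(messages);
attachFileRefsToArgs(args, resolveRef);
// args.image       -> { name: "cat.png", mime: "image/png", base64: "<base64>" }
// args.input_image -> "data:image/png;base64,<base64>"
```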

src/lib/server/textGeneration/mcp/routerResolution.ts

Lines changed: 9 additions & 6 deletions
```diff
@@ -7,6 +7,7 @@ import {
 	pickToolsCapableModel,
 	ROUTER_TOOLS_ROUTE,
 } from "$lib/server/router/toolsRoute";
+import { findConfiguredMultimodalModel } from "$lib/server/router/multimodal";
 import type { EndpointMessage } from "../../endpoints/endpoints";
 import { stripReasoningFromMessageForRouting } from "../utils/routing";
 import type { ProcessedModel } from "../../models";
@@ -48,15 +49,17 @@ export async function resolveRouterTarget({
 	const allModels = mod.models as ProcessedModel[];
 
 	if (hasImageInput) {
-		const multimodalCandidate = allModels?.find(
-			(candidate) => !candidate.isRouter && candidate.multimodal
-		);
-		if (multimodalCandidate) {
+		const multimodalCandidate = findConfiguredMultimodalModel(allModels);
+		if (!multimodalCandidate) {
+			runMcp = false;
+			logger.warn(
+				{ configuredModel: config.LLM_ROUTER_MULTIMODAL_MODEL },
+				"[mcp] multimodal input but configured model missing or invalid; skipping MCP route"
+			);
+		} else {
 			targetModel = multimodalCandidate;
 			candidateModelId = multimodalCandidate.id ?? multimodalCandidate.name;
 			resolvedRoute = "multimodal";
-		} else {
-			runMcp = false;
 		}
 	} else {
 		// If tools are enabled and at least one MCP server is active, prefer a tools-capable model
```
src/lib/server/textGeneration/mcp/runMcpFlow.ts

Lines changed: 5 additions & 1 deletion
```diff
@@ -1,6 +1,5 @@
 import { config } from "$lib/server/config";
 import { MessageUpdateType, type MessageUpdate } from "$lib/types/MessageUpdate";
-import type { EndpointMessage } from "../../endpoints/endpoints";
 import { getMcpServers } from "$lib/server/mcp/registry";
 import { isValidUrl } from "$lib/server/urlSafety";
 import { resetMcpToolsCache } from "$lib/server/mcp/tools";
@@ -14,11 +13,13 @@ import type {
 } from "openai/resources/chat/completions";
 import type { Stream } from "openai/streaming";
 import { buildToolPreprompt } from "../utils/toolPrompt";
+import type { EndpointMessage } from "../../endpoints/endpoints";
 import { resolveRouterTarget } from "./routerResolution";
 import { executeToolCalls, type NormalizedToolCall } from "./toolInvocation";
 import { drainPool } from "$lib/server/mcp/clientPool";
 import type { TextGenerationContext } from "../types";
 import { hasAuthHeader, isStrictHfMcpLogin, hasNonEmptyToken } from "$lib/server/mcp/hf";
+import { buildImageRefResolver } from "./fileRefs";
 
 export type RunMcpFlowContext = Pick<
 	TextGenerationContext,
@@ -200,6 +201,8 @@ export async function* runMcpFlow({
 		// If anything goes wrong reading the flag, proceed (previous behavior)
 	}
 
+	const resolveFileRef = buildImageRefResolver(messages);
+
 	const hasImageInput = messages.some((msg) =>
 		(msg.files ?? []).some(
 			(file) => typeof file?.mime === "string" && file.mime.startsWith("image/")
@@ -599,6 +602,7 @@ export async function* runMcpFlow({
 		mapping,
 		servers,
 		parseArgs,
+		resolveFileRef,
 		toPrimitive,
 		processToolOutput,
 		abortSignal,
```
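
The matching change on the invocation side (toolInvocation.ts) is not part of this excerpt; a plausible, purely illustrative consumption of the new `resolveFileRef` option would hydrate parsed arguments right before the MCP call:

```ts
import { attachFileRefsToArgs, type FileRefResolver } from "./fileRefs";

// Sketch only: not the actual executeToolCalls implementation, which is not shown in this diff.
function hydrateParsedArgs(
	parsedArgs: Record<string, unknown>,
	resolveFileRef?: FileRefResolver
): Record<string, unknown> {
	attachFileRefsToArgs(parsedArgs, resolveFileRef);
	return parsedArgs;
}
```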
