diff --git a/packages/core/examples/gpt54-cua-example.ts b/packages/core/examples/gpt54-cua-example.ts new file mode 100644 index 000000000..6cfa91bdf --- /dev/null +++ b/packages/core/examples/gpt54-cua-example.ts @@ -0,0 +1,47 @@ +import { Stagehand } from "../lib/v3/index.js"; +import chalk from "chalk"; + +async function main() { + console.log(`\n${chalk.bold("Stagehand 🤘 GPT-5.4 CUA Demo")}\n`); + + const stagehand = new Stagehand({ + env: "LOCAL", + verbose: 2, + }); + await stagehand.init(); + + try { + const page = stagehand.context.pages()[0]; + + const agent = stagehand.agent({ + mode: "cua", + model: { + modelName: "openai/gpt-5.4-2026-03-05", + apiKey: process.env.OPENAI_API_KEY, + }, + systemPrompt: `You are a helpful assistant that can use a web browser. + Do not ask follow up questions, the user will trust your judgement. + Today's date is ${new Date().toLocaleDateString()}.`, + }); + + await page.goto("https://news.ycombinator.com"); + + const instruction = + "Find the top story on Hacker News and tell me its title, link, and point count and then click on it and extract a summary for me"; + console.log(`Instruction: ${chalk.white(instruction)}`); + + const result = await agent.execute({ + instruction, + maxSteps: 10, + }); + + console.log(`\n${chalk.green("✓")} Done`); + console.log(`${chalk.yellow("⤷")} ${result.message}`); + } catch (error) { + console.error(`${chalk.red("✗")} Error:`, error); + } finally { + await stagehand.close(); + } +} + +main(); diff --git a/packages/core/lib/v3/agent/AgentProvider.ts b/packages/core/lib/v3/agent/AgentProvider.ts index 248e61805..e7ecf42b1 100644 --- a/packages/core/lib/v3/agent/AgentProvider.ts +++ b/packages/core/lib/v3/agent/AgentProvider.ts @@ -14,6 +14,7 @@ import { MicrosoftCUAClient } from "./MicrosoftCUAClient.js"; // Map model names to their provider types export const modelToAgentProviderMap: Record = { + "gpt-5.4-2026-03-05": "openai", "computer-use-preview": "openai", 
"computer-use-preview-2025-03-11": "openai", "claude-sonnet-4-20250514": "anthropic", diff --git a/packages/core/lib/v3/agent/OpenAICUAClient.ts b/packages/core/lib/v3/agent/OpenAICUAClient.ts index 638308e28..df86a7109 100644 --- a/packages/core/lib/v3/agent/OpenAICUAClient.ts +++ b/packages/core/lib/v3/agent/OpenAICUAClient.ts @@ -45,6 +45,10 @@ export class OpenAICUAClient extends AgentClient { private tools?: ToolSet; private safetyConfirmationHandler?: SafetyConfirmationHandler; + private get usesNewComputerTool(): boolean { + return this.modelName.startsWith("gpt-5"); + } + constructor( type: AgentType, modelName: string, @@ -256,17 +260,12 @@ export class OpenAICUAClient extends AgentClient { const stepActions: AgentAction[] = []; for (const item of output) { if (item.type === "computer_call" && this.isComputerCallItem(item)) { - logger({ - category: "agent", - message: `Found computer_call: ${item.action.type}, call_id: ${item.call_id}`, - level: 2, - }); - const action = this.convertComputerCallToAction(item); - if (action) { + const actions = this.convertComputerCallToActions(item); + for (const action of actions) { stepActions.push(action); logger({ category: "agent", - message: `Converted computer_call to action: ${action.type}`, + message: `Found computer_call action: ${action.type}, call_id: ${item.call_id}`, level: 2, }); } @@ -350,8 +349,8 @@ export class OpenAICUAClient extends AgentClient { return ( item.type === "computer_call" && "call_id" in item && - "action" in item && - typeof item.action === "object" + (("action" in item && typeof item.action === "object") || + ("actions" in item && Array.isArray(item.actions))) ); } @@ -426,19 +425,21 @@ export class OpenAICUAClient extends AgentClient { usage: Record; }> { try { - // Create the request parameters - const requestParams: Record = { - model: this.modelName, - tools: [ - { - type: "computer_use_preview", + // Create the request parameters, branching on tool format + const computerTool = 
this.usesNewComputerTool + ? { type: "computer" as const } + : { + type: "computer_use_preview" as const, display_width: this.currentViewport.width, display_height: this.currentViewport.height, environment: this.environment, - }, - ], + }; + + const requestParams: Record = { + model: this.modelName, + tools: [computerTool], input: inputItems, - truncation: "auto", + ...(this.usesNewComputerTool ? {} : { truncation: "auto" }), }; // Add custom tools if available @@ -521,29 +522,36 @@ export class OpenAICUAClient extends AgentClient { // Process each output item for (const item of output) { if (item.type === "computer_call" && this.isComputerCallItem(item)) { - // Handle computer calls + // Handle computer calls (both single-action and batched-actions formats) try { - const action = this.convertComputerCallToAction(item); + const actions = this.convertComputerCallToActions(item); - if (action && this.actionHandler) { - logger({ - category: "agent", - message: `Executing computer action: ${action.type}`, - level: 1, - }); - await this.actionHandler(action); + if (this.actionHandler) { + for (const action of actions) { + logger({ + category: "agent", + message: `Executing computer action: ${action.type}`, + level: 1, + }); + await this.actionHandler(action); + } } - // Capture a screenshot + // Capture a screenshot after all actions in the batch const screenshot = await this.captureScreenshot(); - // Create a computer_call_output for the next request + // Build the output — use "computer_screenshot" for new format, "input_image" for legacy + const outputType = this.usesNewComputerTool + ? ("computer_screenshot" as const) + : ("input_image" as const); + const outputItem = { type: "computer_call_output" as const, call_id: item.call_id, output: { - type: "input_image" as const, + type: outputType, image_url: screenshot, + ...(this.usesNewComputerTool ? 
{ detail: "original" as const } : {}), }, } as ResponseInputItem; @@ -553,13 +561,13 @@ export class OpenAICUAClient extends AgentClient { level: 2, }); - // Add current URL if available - if (this.currentUrl) { + // Legacy format supports current_url on the output; new format does not + if (!this.usesNewComputerTool && this.currentUrl) { const computerCallOutput = outputItem as { type: "computer_call_output"; call_id: string; output: { - type: "input_image"; + type: "input_image" | "computer_screenshot"; image_url: string; current_url?: string; }; @@ -582,7 +590,7 @@ export class OpenAICUAClient extends AgentClient { type: "computer_call_output"; call_id: string; output: { - type: "input_image"; + type: "input_image" | "computer_screenshot"; image_url: string; }; acknowledged_safety_checks?: SafetyCheck[]; @@ -607,26 +615,29 @@ export class OpenAICUAClient extends AgentClient { }); try { - // Capture a screenshot even on error const screenshot = await this.captureScreenshot(); + const outputType = this.usesNewComputerTool + ? ("computer_screenshot" as const) + : ("input_image" as const); + const errorOutputItem = { type: "computer_call_output" as const, call_id: item.call_id, output: { - type: "input_image" as const, + type: outputType, image_url: screenshot, error: errorMessage, + ...(this.usesNewComputerTool ? 
{ detail: "original" as const } : {}), }, } as ResponseInputItem; - // Add current URL if available - if (this.currentUrl) { + if (!this.usesNewComputerTool && this.currentUrl) { const computerCallOutput = errorOutputItem as { type: "computer_call_output"; call_id: string; output: { - type: "input_image"; + type: "input_image" | "computer_screenshot"; image_url: string; current_url?: string; }; @@ -649,7 +660,7 @@ export class OpenAICUAClient extends AgentClient { type: "computer_call_output"; call_id: string; output: { - type: "input_image"; + type: "input_image" | "computer_screenshot"; image_url: string; }; acknowledged_safety_checks?: SafetyCheck[]; @@ -664,14 +675,12 @@ export class OpenAICUAClient extends AgentClient { if (screenshotError instanceof StagehandClosedError) { throw screenshotError; } - // If we can't capture a screenshot, just send the error logger({ category: "agent", message: `Error capturing screenshot: ${String(screenshotError)}`, level: 0, }); - // For error cases without a screenshot, we need to use a string output nextInputItems.push({ type: "computer_call_output", call_id: item.call_id, @@ -770,15 +779,31 @@ export class OpenAICUAClient extends AgentClient { call: ComputerCallItem, ): AgentAction | null { const { action } = call; + if (!action) return null; - // Instead of wrapping the action in a params object, spread the action properties directly - // This ensures properties like x, y, button, etc. are directly accessible on the AgentAction return { type: action.type as string, - ...action, // Spread all properties from the action + ...action, }; } + /** + * Convert a batched computer_call (GPT-5.4 format) into multiple AgentActions. 
+ */ + private convertComputerCallToActions( + call: ComputerCallItem, + ): AgentAction[] { + if (call.actions && Array.isArray(call.actions)) { + return call.actions.map((action) => ({ + type: action.type as string, + ...action, + })); + } + + const single = this.convertComputerCallToAction(call); + return single ? [single] : []; + } + private convertFunctionCallToAction( call: FunctionCallItem, ): AgentAction | null { diff --git a/packages/core/lib/v3/types/public/agent.ts b/packages/core/lib/v3/types/public/agent.ts index 3e9cf258a..c9ea821f0 100644 --- a/packages/core/lib/v3/types/public/agent.ts +++ b/packages/core/lib/v3/types/public/agent.ts @@ -435,6 +435,7 @@ export type AgentType = | "bedrock"; export const AVAILABLE_CUA_MODELS = [ + "openai/gpt-5.4-2026-03-05", "openai/computer-use-preview", "openai/computer-use-preview-2025-03-11", "anthropic/claude-opus-4-5-20251101", @@ -563,10 +564,14 @@ export interface ResponseItem { export interface ComputerCallItem extends ResponseItem { type: "computer_call"; call_id: string; - action: { + action?: { type: string; [key: string]: unknown; }; + actions?: Array<{ + type: string; + [key: string]: unknown; + }>; pending_safety_checks?: Array<{ id: string; code: string; @@ -588,8 +593,9 @@ export type ResponseInputItem = call_id: string; output: | { - type: "input_image"; + type: "input_image" | "computer_screenshot"; image_url: string; + detail?: "original" | "high" | "low"; current_url?: string; error?: string; [key: string]: unknown; diff --git a/packages/core/tests/unit/public-api/llm-and-agents.test.ts b/packages/core/tests/unit/public-api/llm-and-agents.test.ts index cfec8a209..2e0bc9d6b 100644 --- a/packages/core/tests/unit/public-api/llm-and-agents.test.ts +++ b/packages/core/tests/unit/public-api/llm-and-agents.test.ts @@ -39,6 +39,7 @@ describe("LLM and Agents public API types", () => { const expectedModels = [ "openai/computer-use-preview", "openai/computer-use-preview-2025-03-11", + 
"openai/gpt-5.4-2026-03-05", "anthropic/claude-opus-4-5-20251101", "anthropic/claude-opus-4-6", "anthropic/claude-sonnet-4-6",