Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions packages/core/examples/gpt54-cua-example.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import { Stagehand } from "../lib/v3/index.js";
import chalk from "chalk";

/**
 * Runs the GPT-5.4 computer-use (CUA) demo end to end.
 *
 * Starts a local Stagehand session, opens Hacker News on the first page of the
 * browser context, asks a CUA-mode agent to carry out a single instruction and
 * prints the message from the result. The session is always closed once it has
 * been initialised, and a failed run is reported through a non-zero exit code.
 */
async function main() {
  console.log(`\n${chalk.bold("Stagehand 🤘 GPT-5.4 CUA Demo")}\n`);

  const stagehand = new Stagehand({
    env: "LOCAL",
    verbose: 2,
  });
  await stagehand.init();

  try {
    const page = stagehand.context.pages()[0];
    if (!page) {
      // Fail with a clear message instead of a TypeError on `page.goto` below.
      throw new Error("No open page is available in the Stagehand context");
    }

    const agent = stagehand.agent({
      mode: "cua",
      model: {
        modelName: "openai/gpt-5.4-2026-03-05",
        apiKey: process.env.OPENAI_API_KEY,
      },
      // The continuation lines of this template literal are part of the prompt
      // text, so they are intentionally left unindented.
      systemPrompt: `You are a helpful assistant that can use a web browser.
Do not ask follow up questions, the user will trust your judgement.
Today's date is ${new Date().toLocaleDateString()}.`,
    });

    await page.goto("https://news.ycombinator.com");

    const instruction =
      "Find the top story on Hacker News and tell me its title, link, and point count and then click on it and extract a summary for me";
    console.log(`Instruction: ${chalk.white(instruction)}`);

    const result = await agent.execute({
      instruction,
      maxSteps: 10,
    });

    console.log(`\n${chalk.green("✓")} Done`);
    console.log(`${chalk.yellow("⤷")} ${result.message}`);
  } catch (error) {
    console.error(`${chalk.red("✗")} Error:`, error);
    // Surface the failure to the calling shell/CI; without this the script
    // would log the error and still exit with status 0.
    process.exitCode = 1;
  } finally {
    await stagehand.close();
  }
}

// Script entry point: start the demo. The returned promise is deliberately not
// awaited here, which `void` makes explicit.
void main();
1 change: 1 addition & 0 deletions packages/core/lib/v3/agent/AgentProvider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import { MicrosoftCUAClient } from "./MicrosoftCUAClient.js";

// Map model names to their provider types
export const modelToAgentProviderMap: Record<string, AgentProviderType> = {
"gpt-5.4-2026-03-05": "openai",
"computer-use-preview": "openai",
"computer-use-preview-2025-03-11": "openai",
"claude-sonnet-4-20250514": "anthropic",
Expand Down
117 changes: 71 additions & 46 deletions packages/core/lib/v3/agent/OpenAICUAClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ export class OpenAICUAClient extends AgentClient {
private tools?: ToolSet;
private safetyConfirmationHandler?: SafetyConfirmationHandler;

/**
 * Whether this client should use the newer computer tool format.
 *
 * @returns `true` when the configured model name begins with "gpt-5".
 */
private get usesNewComputerTool(): boolean {
  const newToolModelPrefix = "gpt-5";
  return this.modelName.startsWith(newToolModelPrefix);
}

constructor(
type: AgentType,
modelName: string,
Expand Down Expand Up @@ -256,17 +260,12 @@ export class OpenAICUAClient extends AgentClient {
const stepActions: AgentAction[] = [];
for (const item of output) {
if (item.type === "computer_call" && this.isComputerCallItem(item)) {
logger({
category: "agent",
message: `Found computer_call: ${item.action.type}, call_id: ${item.call_id}`,
level: 2,
});
const action = this.convertComputerCallToAction(item);
if (action) {
const actions = this.convertComputerCallToActions(item);
for (const action of actions) {
stepActions.push(action);
logger({
category: "agent",
message: `Converted computer_call to action: ${action.type}`,
message: `Found computer_call action: ${action.type}, call_id: ${item.call_id}`,
level: 2,
});
}
Expand Down Expand Up @@ -350,8 +349,8 @@ export class OpenAICUAClient extends AgentClient {
return (
item.type === "computer_call" &&
"call_id" in item &&
"action" in item &&
typeof item.action === "object"
(("action" in item && typeof item.action === "object") ||
("actions" in item && Array.isArray(item.actions)))
);
}

Expand Down Expand Up @@ -426,19 +425,21 @@ export class OpenAICUAClient extends AgentClient {
usage: Record<string, number>;
}> {
try {
// Create the request parameters
const requestParams: Record<string, unknown> = {
model: this.modelName,
tools: [
{
type: "computer_use_preview",
// Create the request parameters, branching on tool format
const computerTool = this.usesNewComputerTool
? { type: "computer" as const }
: {
type: "computer_use_preview" as const,
display_width: this.currentViewport.width,
display_height: this.currentViewport.height,
environment: this.environment,
},
],
};

const requestParams: Record<string, unknown> = {
model: this.modelName,
tools: [computerTool],
input: inputItems,
truncation: "auto",
...(this.usesNewComputerTool ? {} : { truncation: "auto" }),
};

// Add custom tools if available
Expand Down Expand Up @@ -521,29 +522,36 @@ export class OpenAICUAClient extends AgentClient {
// Process each output item
for (const item of output) {
if (item.type === "computer_call" && this.isComputerCallItem(item)) {
// Handle computer calls
// Handle computer calls (both single-action and batched-actions formats)
try {
const action = this.convertComputerCallToAction(item);
const actions = this.convertComputerCallToActions(item);

if (action && this.actionHandler) {
logger({
category: "agent",
message: `Executing computer action: ${action.type}`,
level: 1,
});
await this.actionHandler(action);
if (this.actionHandler) {
for (const action of actions) {
logger({
category: "agent",
message: `Executing computer action: ${action.type}`,
level: 1,
});
await this.actionHandler(action);
}
}

// Capture a screenshot
// Capture a screenshot after all actions in the batch
const screenshot = await this.captureScreenshot();

// Create a computer_call_output for the next request
// Build the output — use "computer_screenshot" for new format, "input_image" for legacy
const outputType = this.usesNewComputerTool
? ("computer_screenshot" as const)
: ("input_image" as const);

const outputItem = {
type: "computer_call_output" as const,
call_id: item.call_id,
output: {
type: "input_image" as const,
type: outputType,
image_url: screenshot,
...(this.usesNewComputerTool ? { detail: "original" as const } : {}),
},
} as ResponseInputItem;

Expand All @@ -553,13 +561,13 @@ export class OpenAICUAClient extends AgentClient {
level: 2,
});

// Add current URL if available
if (this.currentUrl) {
// Legacy format supports current_url on the output; new format does not
if (!this.usesNewComputerTool && this.currentUrl) {
const computerCallOutput = outputItem as {
type: "computer_call_output";
call_id: string;
output: {
type: "input_image";
type: "input_image" | "computer_screenshot";
image_url: string;
current_url?: string;
};
Expand All @@ -582,7 +590,7 @@ export class OpenAICUAClient extends AgentClient {
type: "computer_call_output";
call_id: string;
output: {
type: "input_image";
type: "input_image" | "computer_screenshot";
image_url: string;
};
acknowledged_safety_checks?: SafetyCheck[];
Expand All @@ -607,26 +615,29 @@ export class OpenAICUAClient extends AgentClient {
});

try {
// Capture a screenshot even on error
const screenshot = await this.captureScreenshot();

const outputType = this.usesNewComputerTool
? ("computer_screenshot" as const)
: ("input_image" as const);

const errorOutputItem = {
type: "computer_call_output" as const,
call_id: item.call_id,
output: {
type: "input_image" as const,
type: outputType,
image_url: screenshot,
error: errorMessage,
...(this.usesNewComputerTool ? { detail: "original" as const } : {}),
},
} as ResponseInputItem;

// Add current URL if available
if (this.currentUrl) {
if (!this.usesNewComputerTool && this.currentUrl) {
const computerCallOutput = errorOutputItem as {
type: "computer_call_output";
call_id: string;
output: {
type: "input_image";
type: "input_image" | "computer_screenshot";
image_url: string;
current_url?: string;
};
Expand All @@ -649,7 +660,7 @@ export class OpenAICUAClient extends AgentClient {
type: "computer_call_output";
call_id: string;
output: {
type: "input_image";
type: "input_image" | "computer_screenshot";
image_url: string;
};
acknowledged_safety_checks?: SafetyCheck[];
Expand All @@ -664,14 +675,12 @@ export class OpenAICUAClient extends AgentClient {
if (screenshotError instanceof StagehandClosedError) {
throw screenshotError;
}
// If we can't capture a screenshot, just send the error
logger({
category: "agent",
message: `Error capturing screenshot: ${String(screenshotError)}`,
level: 0,
});

// For error cases without a screenshot, we need to use a string output
nextInputItems.push({
type: "computer_call_output",
call_id: item.call_id,
Expand Down Expand Up @@ -770,15 +779,31 @@ export class OpenAICUAClient extends AgentClient {
call: ComputerCallItem,
): AgentAction | null {
const { action } = call;
if (!action) return null;

// Instead of wrapping the action in a params object, spread the action properties directly
// This ensures properties like x, y, button, etc. are directly accessible on the AgentAction
return {
type: action.type as string,
...action, // Spread all properties from the action
...action,
};
}

/**
 * Turn a computer_call item into a list of AgentActions.
 *
 * When the call carries an `actions` array (the batched GPT-5.4 format), each
 * entry becomes one AgentAction with the entry's properties spread onto it.
 * Otherwise the call is handed to `convertComputerCallToAction`, and the
 * result is wrapped in a one-element list (or an empty list if it is null).
 *
 * @param call - The computer_call item to convert.
 * @returns The converted actions, in the order they appear in the call.
 */
private convertComputerCallToActions(
  call: ComputerCallItem,
): AgentAction[] {
  const batched = call.actions;

  // No batch present: fall back to the single-action conversion.
  if (!Array.isArray(batched)) {
    const fallback = this.convertComputerCallToAction(call);
    return fallback ? [fallback] : [];
  }

  const converted: AgentAction[] = [];
  for (const entry of batched) {
    // `type` is listed first so it stays the leading key of the result.
    converted.push({
      type: entry.type as string,
      ...entry,
    });
  }
  return converted;
}

private convertFunctionCallToAction(
call: FunctionCallItem,
): AgentAction | null {
Expand Down
10 changes: 8 additions & 2 deletions packages/core/lib/v3/types/public/agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,7 @@ export type AgentType =
| "bedrock";

export const AVAILABLE_CUA_MODELS = [
"openai/gpt-5.4-2026-03-05",
"openai/computer-use-preview",
"openai/computer-use-preview-2025-03-11",
"anthropic/claude-opus-4-5-20251101",
Expand Down Expand Up @@ -563,10 +564,14 @@ export interface ResponseItem {
export interface ComputerCallItem extends ResponseItem {
type: "computer_call";
call_id: string;
action: {
action?: {
type: string;
[key: string]: unknown;
};
actions?: Array<{
type: string;
[key: string]: unknown;
}>;
pending_safety_checks?: Array<{
id: string;
code: string;
Expand All @@ -588,8 +593,9 @@ export type ResponseInputItem =
call_id: string;
output:
| {
type: "input_image";
type: "input_image" | "computer_screenshot";
image_url: string;
detail?: "original" | "high" | "low";
current_url?: string;
error?: string;
[key: string]: unknown;
Expand Down
1 change: 1 addition & 0 deletions packages/core/tests/unit/public-api/llm-and-agents.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ describe("LLM and Agents public API types", () => {
const expectedModels = [
"openai/computer-use-preview",
"openai/computer-use-preview-2025-03-11",
"openai/gpt-5.4-2026-03-05",
"anthropic/claude-opus-4-5-20251101",
"anthropic/claude-opus-4-6",
"anthropic/claude-sonnet-4-6",
Expand Down
Loading