Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 7 additions & 11 deletions packages/core/lib/v3/agent/tools/act.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,11 @@ import { tool } from "ai";
import { z } from "zod";
import type { V3 } from "../../v3.js";
import type { Action } from "../../types/public/methods.js";
import type { AgentModelConfig, Variables } from "../../types/public/agent.js";
import { TimeoutError } from "../../types/public/sdkErrors.js";
import type { AgentToolFactoryOptions } from "./types.js";

export const actTool = (
v3: V3,
executionModel?: string | AgentModelConfig,
variables?: Variables,
toolTimeout?: number,
) => {
export const actTool = (v3: V3, options: AgentToolFactoryOptions = {}) => {
const { executionModel, variables, toolTimeout, page } = options;
const hasVariables = variables && Object.keys(variables).length > 0;
const actionDescription = hasVariables
? `Describe what to click or type, e.g. "click the Login button" or "type %variableName% into the input". Available variables: ${Object.keys(variables).join(", ")}`
Expand All @@ -35,11 +31,11 @@ export const actTool = (
},
},
});
const options = executionModel
? { model: executionModel, variables, timeout: toolTimeout }
: { variables, timeout: toolTimeout };
const actOptions = executionModel
? { model: executionModel, variables, timeout: toolTimeout, page }
: { variables, timeout: toolTimeout, page };

const result = await v3.act(action, options);
const result = await v3.act(action, actOptions);
const actions = (result.actions as Action[] | undefined) ?? [];
v3.recordAgentReplayStep({
type: "act",
Expand Down
17 changes: 11 additions & 6 deletions packages/core/lib/v3/agent/tools/ariaTree.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import { tool } from "ai";
import { z } from "zod";
import type { V3 } from "../../v3.js";
import { resolvePage } from "../utils/resolvePage.js";
import { TimeoutError } from "../../types/public/sdkErrors.js";
import type { AgentToolFactoryOptions } from "./types.js";

export const ariaTreeTool = (v3: V3, toolTimeout?: number) =>
tool({
export const ariaTreeTool = (v3: V3, options: AgentToolFactoryOptions = {}) => {
const { toolTimeout, page } = options;

return tool({
description:
"gets the accessibility (ARIA) hybrid tree text for the current page. use this to understand structure and content.",
inputSchema: z.object({}),
Expand All @@ -15,14 +19,14 @@ export const ariaTreeTool = (v3: V3, toolTimeout?: number) =>
message: `Agent calling tool: ariaTree`,
level: 1,
});
const page = await v3.context.awaitActivePage();
const activePage = await resolvePage(v3, page);
const extractOptions = toolTimeout
? { timeout: toolTimeout }
: undefined;
? { timeout: toolTimeout, page: activePage }
: { page: activePage };
const { pageText } = (await v3.extract(extractOptions)) as {
pageText: string;
};
const pageUrl = page.url();
const pageUrl = activePage.url();

let content = pageText;
const MAX_TOKENS = 70000; // rough cap, assume ~4 chars per token for conservative truncation
Expand Down Expand Up @@ -74,3 +78,4 @@ export const ariaTreeTool = (v3: V3, toolTimeout?: number) =>
};
},
});
};
15 changes: 10 additions & 5 deletions packages/core/lib/v3/agent/tools/click.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,13 @@ import type {
import { processCoordinates } from "../utils/coordinateNormalization.js";
import { ensureXPath } from "../utils/xpath.js";
import { waitAndCaptureScreenshot } from "../utils/screenshotHandler.js";
import { resolvePage } from "../utils/resolvePage.js";
import type { AgentToolFactoryOptions } from "./types.js";

export const clickTool = (v3: V3, provider?: string) =>
tool({
export const clickTool = (v3: V3, options: AgentToolFactoryOptions = {}) => {
const { provider, page } = options;

return tool({
description:
"Click on an element using its coordinates (this is the most reliable way to click on an element, always use this over act, unless the element is not visible in the screenshot, but shown in ariaTree)",
inputSchema: z.object({
Expand All @@ -26,7 +30,7 @@ export const clickTool = (v3: V3, provider?: string) =>
}),
execute: async ({ describe, coordinates }): Promise<ClickToolResult> => {
try {
const page = await v3.context.awaitActivePage();
const activePage = await resolvePage(v3, page);
const processed = processCoordinates(
coordinates[0],
coordinates[1],
Expand All @@ -48,11 +52,11 @@ export const clickTool = (v3: V3, provider?: string) =>

// Only request XPath when caching is enabled to avoid unnecessary computation
const shouldCollectXpath = v3.isAgentReplayActive();
const xpath = await page.click(processed.x, processed.y, {
const xpath = await activePage.click(processed.x, processed.y, {
returnXpath: shouldCollectXpath,
});

const screenshotBase64 = await waitAndCaptureScreenshot(page);
const screenshotBase64 = await waitAndCaptureScreenshot(activePage);

// Record as an "act" step with proper Action for deterministic replay (only when caching)
if (shouldCollectXpath) {
Expand Down Expand Up @@ -114,3 +118,4 @@ export const clickTool = (v3: V3, provider?: string) =>
return { type: "content", value: content };
},
});
};
16 changes: 12 additions & 4 deletions packages/core/lib/v3/agent/tools/clickAndHold.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,16 @@ import type { V3 } from "../../v3.js";
import type { Action } from "../../types/public/methods.js";
import { processCoordinates } from "../utils/coordinateNormalization.js";
import { ensureXPath } from "../utils/xpath.js";
import { resolvePage } from "../utils/resolvePage.js";
import type { AgentToolFactoryOptions } from "./types.js";

export const clickAndHoldTool = (v3: V3, provider?: string) =>
tool({
export const clickAndHoldTool = (
v3: V3,
options: AgentToolFactoryOptions = {},
) => {
const { provider, page } = options;

return tool({
description: "Click and hold on an element using its coordinates",
inputSchema: z.object({
describe: z
Expand All @@ -23,7 +30,7 @@ export const clickAndHoldTool = (v3: V3, provider?: string) =>
}),
execute: async ({ describe, coordinates, duration }) => {
try {
const page = await v3.context.awaitActivePage();
const activePage = await resolvePage(v3, page);
const processed = processCoordinates(
coordinates[0],
coordinates[1],
Expand All @@ -50,7 +57,7 @@ export const clickAndHoldTool = (v3: V3, provider?: string) =>
const shouldCollectXpath = v3.isAgentReplayActive();

// Use dragAndDrop from same point to same point with delay to simulate click and hold
const [xpath] = await page.dragAndDrop(
const [xpath] = await activePage.dragAndDrop(
processed.x,
processed.y,
processed.x,
Expand Down Expand Up @@ -86,3 +93,4 @@ export const clickAndHoldTool = (v3: V3, provider?: string) =>
}
},
});
};
18 changes: 13 additions & 5 deletions packages/core/lib/v3/agent/tools/dragAndDrop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,16 @@ import type {
import { processCoordinates } from "../utils/coordinateNormalization.js";
import { ensureXPath } from "../utils/xpath.js";
import { waitAndCaptureScreenshot } from "../utils/screenshotHandler.js";
import { resolvePage } from "../utils/resolvePage.js";
import type { AgentToolFactoryOptions } from "./types.js";

export const dragAndDropTool = (v3: V3, provider?: string) =>
tool({
export const dragAndDropTool = (
v3: V3,
options: AgentToolFactoryOptions = {},
) => {
const { provider, page } = options;

return tool({
description:
"Drag and drop an element using its coordinates (this is the most reliable way to drag and drop an element, always use this over act, unless the element is not visible in the screenshot, but shown in ariaTree)",
inputSchema: z.object({
Expand All @@ -29,7 +36,7 @@ export const dragAndDropTool = (v3: V3, provider?: string) =>
endCoordinates,
}): Promise<DragAndDropToolResult> => {
try {
const page = await v3.context.awaitActivePage();
const activePage = await resolvePage(v3, page);
const processedStart = processCoordinates(
startCoordinates[0],
startCoordinates[1],
Expand Down Expand Up @@ -59,15 +66,15 @@ export const dragAndDropTool = (v3: V3, provider?: string) =>

// Only request XPath when caching is enabled to avoid unnecessary computation
const shouldCollectXpath = v3.isAgentReplayActive();
const [fromXpath, toXpath] = await page.dragAndDrop(
const [fromXpath, toXpath] = await activePage.dragAndDrop(
processedStart.x,
processedStart.y,
processedEnd.x,
processedEnd.y,
{ returnXpath: shouldCollectXpath },
);

const screenshotBase64 = await waitAndCaptureScreenshot(page);
const screenshotBase64 = await waitAndCaptureScreenshot(activePage);

// Record as "act" step with proper Action for deterministic replay (only when caching)
if (shouldCollectXpath) {
Expand Down Expand Up @@ -128,3 +135,4 @@ export const dragAndDropTool = (v3: V3, provider?: string) =>
return { type: "content", value: content };
},
});
};
24 changes: 12 additions & 12 deletions packages/core/lib/v3/agent/tools/extract.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { tool } from "ai";
import { z, ZodTypeAny } from "zod";
import type { V3 } from "../../v3.js";
import type { AgentModelConfig } from "../../types/public/agent.js";
import { TimeoutError } from "../../types/public/sdkErrors.js";
import type { AgentToolFactoryOptions } from "./types.js";

interface JsonSchema {
type?: string;
Expand Down Expand Up @@ -46,32 +46,30 @@ function jsonSchemaToZod(schema: JsonSchema): ZodTypeAny {
}
}

export const extractTool = (
v3: V3,
executionModel?: string | AgentModelConfig,
toolTimeout?: number,
) =>
tool({
export const extractTool = (v3: V3, options: AgentToolFactoryOptions = {}) => {
const { executionModel, toolTimeout, page } = options;

return tool({
description: `Extract structured data from the current page based on a provided schema.

USAGE GUIDELINES:
- Keep schemas MINIMAL - only include fields essential for the task
- IMPORTANT: only use this if explicitly asked for structured output. In most scenarios, you should use the aria tree tool over this.
- For URL fields, use format: "url"

EXAMPLES:
1. Extract a single value:
instruction: "extract the product price"
schema: { type: "object", properties: { price: { type: "number" } } }

2. Extract multiple fields:
instruction: "extract product name and price"
schema: { type: "object", properties: { name: { type: "string" }, price: { type: "number" } } }

3. Extract arrays:
instruction: "extract all product names and prices"
schema: { type: "object", properties: { products: { type: "array", items: { type: "object", properties: { name: { type: "string" }, price: { type: "number" } } } } } }

4. Extract a URL:
instruction: "extract the link"
schema: { type: "object", properties: { url: { type: "string", format: "url" } } }`,
Expand All @@ -97,6 +95,7 @@ export const extractTool = (
const result = await v3.extract(instruction, parsedSchema, {
...(executionModel ? { model: executionModel } : {}),
timeout: toolTimeout,
page,
});
return { success: true, result };
} catch (error) {
Expand All @@ -116,3 +115,4 @@ export const extractTool = (
}
},
});
};
18 changes: 11 additions & 7 deletions packages/core/lib/v3/agent/tools/fillFormVision.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,19 @@ import type { Action } from "../../types/public/methods.js";
import type {
FillFormVisionToolResult,
ModelOutputContentItem,
Variables,
} from "../../types/public/agent.js";
import { processCoordinates } from "../utils/coordinateNormalization.js";
import { ensureXPath } from "../utils/xpath.js";
import { waitAndCaptureScreenshot } from "../utils/screenshotHandler.js";
import { substituteVariables } from "../utils/variables.js";
import { resolvePage } from "../utils/resolvePage.js";
import type { AgentToolFactoryOptions } from "./types.js";

export const fillFormVisionTool = (
v3: V3,
provider?: string,
variables?: Variables,
options: AgentToolFactoryOptions = {},
) => {
const { provider, variables, page } = options;
const hasVariables = variables && Object.keys(variables).length > 0;
const valueDescription = hasVariables
? `Text to type into the target field. Use %variableName% to substitute a variable value. Available: ${Object.keys(variables).join(", ")}`
Expand Down Expand Up @@ -62,7 +63,7 @@ MANDATORY USE CASES (always use fillFormVision for these):
}),
execute: async ({ fields }): Promise<FillFormVisionToolResult> => {
try {
const page = await v3.context.awaitActivePage();
const activePage = await resolvePage(v3, page);

// Process coordinates and substitute variables for each field
// Keep original values (with %tokens%) for logging/caching, substituted values for typing
Expand Down Expand Up @@ -99,14 +100,14 @@ MANDATORY USE CASES (always use fillFormVision for these):

for (const field of processedFields) {
// Click the field, only requesting XPath when caching is enabled
const xpath = await page.click(
const xpath = await activePage.click(
field.coordinates.x,
field.coordinates.y,
{
returnXpath: shouldCollectXpath,
},
);
await page.type(field.value);
await activePage.type(field.value);

// Build Action with XPath for deterministic replay (only when caching)
// Use originalValue (with %tokens%) so cache stores references, not sensitive values
Expand All @@ -126,7 +127,10 @@ MANDATORY USE CASES (always use fillFormVision for these):
await new Promise((resolve) => setTimeout(resolve, 100));
}

const screenshotBase64 = await waitAndCaptureScreenshot(page, 100);
const screenshotBase64 = await waitAndCaptureScreenshot(
activePage,
100,
);

// Record as "act" step with proper Actions for deterministic replay (only when caching)
if (shouldCollectXpath && actions.length > 0) {
Expand Down
18 changes: 7 additions & 11 deletions packages/core/lib/v3/agent/tools/fillform.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,11 @@ import { tool } from "ai";
import { z } from "zod";
import type { V3 } from "../../v3.js";
import type { Action } from "../../types/public/methods.js";
import type { AgentModelConfig, Variables } from "../../types/public/agent.js";
import { TimeoutError } from "../../types/public/sdkErrors.js";
import type { AgentToolFactoryOptions } from "./types.js";

export const fillFormTool = (
v3: V3,
executionModel?: string | AgentModelConfig,
variables?: Variables,
toolTimeout?: number,
) => {
export const fillFormTool = (v3: V3, options: AgentToolFactoryOptions = {}) => {
const { executionModel, variables, toolTimeout, page } = options;
const hasVariables = variables && Object.keys(variables).length > 0;
const valueDescription = hasVariables
? `Text to type into the target. Use %variableName% to substitute a variable value. Available: ${Object.keys(variables).join(", ")}`
Expand Down Expand Up @@ -50,16 +46,16 @@ export const fillFormTool = (
.join(", ")}`;

const observeOptions = executionModel
? { model: executionModel, timeout: toolTimeout }
: { timeout: toolTimeout };
? { model: executionModel, timeout: toolTimeout, page }
: { timeout: toolTimeout, page };
const observeResults = await v3.observe(instruction, observeOptions);

const completed = [] as unknown[];
const replayableActions: Action[] = [];
for (const res of observeResults) {
const actOptions = variables
? { variables, timeout: toolTimeout }
: { timeout: toolTimeout };
? { variables, timeout: toolTimeout, page }
: { timeout: toolTimeout, page };
const actResult = await v3.act(res, actOptions);
completed.push(actResult);
if (Array.isArray(actResult.actions)) {
Expand Down
Loading
Loading