Skip to content

Commit 05f5580

Browse files
authored
update agent cache handling (#1431)
# why
Add cache support for keys actions.

# what changed
Save click and type actions in the act cache for the agent.

# test plan
Ran the sign-in example with click, type, and keys actions, then re-ran it using the cache.

<!-- This is an auto-generated description by cubic. -->
---
## Summary by cubic
Makes agent actions deterministic and cacheable by recording XPath-based “act” steps for clicks, typing, drag-and-drop, click-and-hold, and form fills, and adds replay support for key events. This improves cache hit rates and makes replays more reliable.

- New Features
  - Record tools as “act” steps with Action objects (method, selector, args) using XPath captured via returnXpath.
  - Add AgentReplayKeysStep and AgentCache logic to replay type/press (with repeat support).
  - Introduce ensureXPath utility and use it across tools and v3CuaAgentHandler to normalize selectors.

<sup>Written for commit f62c537. Summary will update automatically on new commits.</sup>
<!-- End of auto-generated description by cubic. -->
1 parent fea1700 commit 05f5580

File tree

10 files changed

+200
-54
lines changed

10 files changed

+200
-54
lines changed

.changeset/red-boxes-occur.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand": patch
3+
---
4+
5+
Update the cache handling for agent

packages/core/lib/v3/agent/tools/click.ts

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
import { tool } from "ai";
22
import { z } from "zod";
33
import type { V3 } from "../../v3";
4+
import type { Action } from "../../types/public/methods";
45
import {
56
processCoordinates,
67
isGoogleProvider,
78
} from "../utils/coordinateNormalization";
9+
import { ensureXPath } from "../utils/xpath";
810

911
function waitForTimeout(ms: number) {
1012
return new Promise((resolve) => setTimeout(resolve, ms));
@@ -44,16 +46,34 @@ export const clickTool = (v3: V3, provider?: string) =>
4446
},
4547
},
4648
});
47-
await page.click(processed.x, processed.y);
49+
50+
// Use returnXpath to get the XPath of the clicked element for caching
51+
const xpath = await page.click(processed.x, processed.y, {
52+
returnXpath: true,
53+
});
54+
4855
// Google models need extra delay for page to settle after click
4956
if (isGoogleProvider(provider)) {
5057
await waitForTimeout(1000);
5158
}
52-
v3.recordAgentReplayStep({
53-
type: "click",
54-
instruction: describe,
55-
playwrightArguments: { coordinates: [processed.x, processed.y] },
56-
});
59+
60+
// Record as an "act" step with proper Action for deterministic replay
61+
const normalizedXpath = ensureXPath(xpath);
62+
if (normalizedXpath) {
63+
const action: Action = {
64+
selector: normalizedXpath,
65+
description: describe,
66+
method: "click",
67+
arguments: [],
68+
};
69+
v3.recordAgentReplayStep({
70+
type: "act",
71+
instruction: describe,
72+
actions: [action],
73+
actionDescription: describe,
74+
});
75+
}
76+
5777
return {
5878
success: true,
5979
describe,

packages/core/lib/v3/agent/tools/clickAndHold.ts

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import { tool } from "ai";
22
import { z } from "zod";
33
import type { V3 } from "../../v3";
4+
import type { Action } from "../../types/public/methods";
45
import { processCoordinates } from "../utils/coordinateNormalization";
6+
import { ensureXPath } from "../utils/xpath";
57

68
export const clickAndHoldTool = (v3: V3, provider?: string) =>
79
tool({
@@ -44,22 +46,34 @@ export const clickAndHoldTool = (v3: V3, provider?: string) =>
4446
},
4547
},
4648
});
49+
4750
// Use dragAndDrop from same point to same point with delay to simulate click and hold
48-
await page.dragAndDrop(
51+
// returnXpath gives us the xpath of the element at that position
52+
const [xpath] = await page.dragAndDrop(
4953
processed.x,
5054
processed.y,
5155
processed.x,
5256
processed.y,
53-
{ delay: duration },
57+
{ delay: duration, returnXpath: true },
5458
);
55-
v3.recordAgentReplayStep({
56-
type: "clickAndHold",
57-
instruction: describe,
58-
playwrightArguments: {
59-
coordinates: [processed.x, processed.y],
60-
duration,
61-
},
62-
});
59+
60+
// Record as "act" step with proper Action for deterministic replay
61+
const normalizedXpath = ensureXPath(xpath);
62+
if (normalizedXpath) {
63+
const action: Action = {
64+
selector: normalizedXpath,
65+
description: describe,
66+
method: "clickAndHold",
67+
arguments: [String(duration)],
68+
};
69+
v3.recordAgentReplayStep({
70+
type: "act",
71+
instruction: describe,
72+
actions: [action],
73+
actionDescription: describe,
74+
});
75+
}
76+
6377
return { success: true, describe };
6478
} catch (error) {
6579
return {

packages/core/lib/v3/agent/tools/dragAndDrop.ts

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import { tool } from "ai";
22
import { z } from "zod";
33
import type { V3 } from "../../v3";
4+
import type { Action } from "../../types/public/methods";
45
import { processCoordinates } from "../utils/coordinateNormalization";
6+
import { ensureXPath } from "../utils/xpath";
57

68
export const dragAndDropTool = (v3: V3, provider?: string) =>
79
tool({
@@ -47,20 +49,34 @@ export const dragAndDropTool = (v3: V3, provider?: string) =>
4749
},
4850
},
4951
});
50-
await page.dragAndDrop(
52+
53+
// Use returnXpath to get XPaths of both start and end elements for caching
54+
const [fromXpath, toXpath] = await page.dragAndDrop(
5155
processedStart.x,
5256
processedStart.y,
5357
processedEnd.x,
5458
processedEnd.y,
59+
{ returnXpath: true },
5560
);
56-
v3.recordAgentReplayStep({
57-
type: "dragAndDrop",
58-
instruction: describe,
59-
playwrightArguments: {
60-
startCoordinates: [processedStart.x, processedStart.y],
61-
endCoordinates: [processedEnd.x, processedEnd.y],
62-
},
63-
});
61+
62+
// Record as "act" step with proper Action for deterministic replay
63+
const normalizedFrom = ensureXPath(fromXpath);
64+
const normalizedTo = ensureXPath(toXpath);
65+
if (normalizedFrom && normalizedTo) {
66+
const action: Action = {
67+
selector: normalizedFrom,
68+
description: describe,
69+
method: "dragAndDrop",
70+
arguments: [normalizedTo],
71+
};
72+
v3.recordAgentReplayStep({
73+
type: "act",
74+
instruction: describe,
75+
actions: [action],
76+
actionDescription: describe,
77+
});
78+
}
79+
6480
return { success: true, describe };
6581
} catch (error) {
6682
return {

packages/core/lib/v3/agent/tools/fillFormVision.ts

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import { tool } from "ai";
22
import { z } from "zod";
33
import type { V3 } from "../../v3";
4+
import type { Action } from "../../types/public/methods";
45
import { processCoordinates } from "../utils/coordinateNormalization";
6+
import { ensureXPath } from "../utils/xpath";
57

68
export const fillFormVisionTool = (v3: V3, provider?: string) =>
79
tool({
@@ -71,18 +73,44 @@ MANDATORY USE CASES (always use fillFormVision for these):
7173
},
7274
});
7375

76+
// Collect actions with XPaths for cache replay
77+
const actions: Action[] = [];
78+
7479
for (const field of processedFields) {
75-
await page.click(field.coordinates.x, field.coordinates.y);
80+
// Click the field with returnXpath to get the element's XPath
81+
const xpath = await page.click(
82+
field.coordinates.x,
83+
field.coordinates.y,
84+
{
85+
returnXpath: true,
86+
},
87+
);
7688
await page.type(field.value);
89+
90+
// Build Action with XPath for deterministic replay
91+
const normalizedXpath = ensureXPath(xpath);
92+
if (normalizedXpath) {
93+
actions.push({
94+
selector: normalizedXpath,
95+
description: field.action,
96+
method: "type",
97+
arguments: [field.value],
98+
});
99+
}
100+
77101
// Small delay between fields
78102
await new Promise((resolve) => setTimeout(resolve, 100));
79103
}
80104

81-
v3.recordAgentReplayStep({
82-
type: "fillFormVision",
83-
instruction: `Fill ${fields.length} form fields`,
84-
playwrightArguments: processedFields,
85-
});
105+
// Record as "act" step with proper Actions for deterministic replay
106+
if (actions.length > 0) {
107+
v3.recordAgentReplayStep({
108+
type: "act",
109+
instruction: `Fill ${fields.length} form fields`,
110+
actions,
111+
actionDescription: `Fill ${fields.length} form fields`,
112+
});
113+
}
86114

87115
return {
88116
success: true,

packages/core/lib/v3/agent/tools/type.ts

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
import { tool } from "ai";
22
import { z } from "zod";
33
import type { V3 } from "../../v3";
4+
import type { Action } from "../../types/public/methods";
45
import {
56
processCoordinates,
67
isGoogleProvider,
78
} from "../utils/coordinateNormalization";
9+
import { ensureXPath } from "../utils/xpath";
810

911
function waitForTimeout(ms: number) {
1012
return new Promise((resolve) => setTimeout(resolve, ms));
@@ -45,21 +47,36 @@ export const typeTool = (v3: V3, provider?: string) =>
4547
},
4648
},
4749
});
48-
// Click the element first, then type
49-
await page.click(processed.x, processed.y);
50+
51+
// Click the element first with returnXpath to get the element's XPath
52+
const xpath = await page.click(processed.x, processed.y, {
53+
returnXpath: true,
54+
});
55+
5056
// Google models need extra delay for page to settle after click
5157
if (isGoogleProvider(provider)) {
5258
await waitForTimeout(1000);
5359
}
60+
5461
await page.type(text);
55-
v3.recordAgentReplayStep({
56-
type: "type",
57-
instruction: describe,
58-
playwrightArguments: {
59-
coordinates: [processed.x, processed.y],
60-
text,
61-
},
62-
});
62+
63+
// Record as an "act" step with proper Action for deterministic replay
64+
const normalizedXpath = ensureXPath(xpath);
65+
if (normalizedXpath) {
66+
const action: Action = {
67+
selector: normalizedXpath,
68+
description: describe,
69+
method: "type",
70+
arguments: [text],
71+
};
72+
v3.recordAgentReplayStep({
73+
type: "act",
74+
instruction: describe,
75+
actions: [action],
76+
actionDescription: describe,
77+
});
78+
}
79+
6380
return { success: true, describe, text };
6481
} catch (error) {
6582
return {
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
/**
2+
* Utility functions for XPath handling in agent tools.
3+
*/
4+
/**
 * Normalizes a value into an XPath selector string.
 *
 * Returns null when the value cannot be used as a selector: it is not a
 * string, it is blank after trimming, or it consists of only the "xpath="
 * prefix with no expression after it.
 *
 * @param value - The value to normalize as an XPath
 * @returns The trimmed value prefixed with "xpath=" (the prefix is not
 *   duplicated when already present), or null
 */
export function ensureXPath(value: unknown): string | null {
  if (typeof value !== "string") return null;
  const trimmed = value.trim();
  if (!trimmed) return null;

  const prefix = "xpath=";
  if (!trimmed.startsWith(prefix)) return `${prefix}${trimmed}`;

  // A bare prefix carries no expression. Returning it would be truthy and
  // would slip past callers' `if (normalizedXpath)` guards as an empty selector.
  return trimmed.slice(prefix.length).trim() ? trimmed : null;
}

packages/core/lib/v3/cache/AgentCache.ts

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import type {
55
AgentReplayActStep,
66
AgentReplayFillFormStep,
77
AgentReplayGotoStep,
8+
AgentReplayKeysStep,
89
AgentReplayNavBackStep,
910
AgentReplayScrollStep,
1011
AgentReplayStep,
@@ -554,6 +555,9 @@ export class AgentCache {
554555
case "navback":
555556
await this.replayAgentNavBackStep(step as AgentReplayNavBackStep, ctx);
556557
return;
558+
case "keys":
559+
await this.replayAgentKeysStep(step as AgentReplayKeysStep, ctx);
560+
return;
557561
case "close":
558562
case "extract":
559563
case "screenshot":
@@ -654,4 +658,23 @@ export class AgentCache {
654658
const page = await ctx.awaitActivePage();
655659
await page.goBack({ waitUntil: step.waitUntil ?? "domcontentloaded" });
656660
}
661+
/**
 * Replays a recorded "keys" step against the active page.
 *
 * Depending on the recorded method, either types the recorded text or
 * presses the recorded keys. The action is repeated `times` times, with a
 * floor of one. Nothing is replayed when the method is neither "type" nor
 * "press", or when the matching payload (text / keys) is falsy.
 *
 * @param step - The cached keys step whose `playwrightArguments` are replayed
 * @param ctx - Context used to obtain the active page
 */
private async replayAgentKeysStep(
  step: AgentReplayKeysStep,
  ctx: V3Context,
): Promise<void> {
  const activePage = await ctx.awaitActivePage();
  const { method, text, keys, times } = step.playwrightArguments;
  const iterations = Math.max(1, times ?? 1);

  // Select the single keystroke action up front so one loop handles both
  // cases. The delay value of 100 is presumably milliseconds; confirm
  // against the page API.
  const keystroke =
    method === "type" && text
      ? () => activePage.type(text, { delay: 100 })
      : method === "press" && keys
        ? () => activePage.keyPress(keys, { delay: 100 })
        : null;

  if (keystroke === null) return;

  for (let round = 0; round < iterations; round += 1) {
    await keystroke();
  }
}
657680
}

0 commit comments

Comments
 (0)