Skip to content

Commit 0d9f8e4

Browse files
committed
move mode back to stagehand agent definition
1 parent 89cb6af commit 0d9f8e4

File tree

5 files changed: 49 additions and 70 deletions

packages/core/lib/v3/handlers/v3AgentHandler.ts

Lines changed: 13 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@ export class V3AgentHandler {
4545
private executionModel?: string;
4646
private systemInstructions?: string;
4747
private mcpTools?: ToolSet;
48+
private mode: AgentToolMode;
4849

4950
constructor(
5051
v3: V3,
@@ -53,18 +54,19 @@ export class V3AgentHandler {
5354
executionModel?: string,
5455
systemInstructions?: string,
5556
mcpTools?: ToolSet,
57+
mode?: AgentToolMode,
5658
) {
5759
this.v3 = v3;
5860
this.logger = logger;
5961
this.llmClient = llmClient;
6062
this.executionModel = executionModel;
6163
this.systemInstructions = systemInstructions;
6264
this.mcpTools = mcpTools;
65+
this.mode = mode ?? "dom";
6366
}
6467

6568
private async prepareAgent(
6669
instructionOrOptions: string | AgentExecuteOptionsBase,
67-
effectiveMode?: AgentToolMode,
6870
): Promise<AgentContext> {
6971
try {
7072
const options =
@@ -74,22 +76,19 @@ export class V3AgentHandler {
7476

7577
const maxSteps = options.maxSteps || 20;
7678

77-
// Use effective mode passed from execute/stream, default to "dom"
78-
const mode = effectiveMode ?? "dom";
79-
8079
// Get the initial page URL first (needed for the system prompt)
8180
const initialPageUrl = (await this.v3.context.awaitActivePage()).url();
8281

8382
// Build the system prompt with mode-aware tool guidance
8483
const systemPrompt = buildAgentSystemPrompt({
8584
url: initialPageUrl,
8685
executionInstruction: options.instruction,
87-
mode,
86+
mode: this.mode,
8887
systemInstructions: this.systemInstructions,
8988
isBrowserbase: this.v3.isBrowserbase,
9089
});
9190

92-
const tools = this.createTools(mode);
91+
const tools = this.createTools();
9392
const allTools: ToolSet = { ...tools, ...this.mcpTools };
9493

9594
// Use provided messages for continuation, or start fresh with the instruction
@@ -212,12 +211,9 @@ export class V3AgentHandler {
212211
typeof instructionOrOptions === "object" ? instructionOrOptions : null;
213212
const signal = options?.signal;
214213

215-
// Determine effective mode from execute options, default to "dom"
216-
const effectiveMode = options?.mode ?? "dom";
217-
218214
// Highlight cursor defaults to true for hybrid mode, can be overridden
219215
const shouldHighlightCursor =
220-
options?.highlightCursor ?? effectiveMode === "hybrid";
216+
options?.highlightCursor ?? this.mode === "hybrid";
221217

222218
const state: AgentState = {
223219
collectedReasoning: [],
@@ -238,10 +234,10 @@ export class V3AgentHandler {
238234
messages: preparedMessages,
239235
wrappedModel,
240236
initialPageUrl,
241-
} = await this.prepareAgent(instructionOrOptions, effectiveMode);
237+
} = await this.prepareAgent(instructionOrOptions);
242238

243239
// Enable cursor overlay for hybrid mode (coordinate-based interactions)
244-
if (shouldHighlightCursor && effectiveMode === "hybrid") {
240+
if (shouldHighlightCursor && this.mode === "hybrid") {
245241
const page = await this.v3.context.awaitActivePage();
246242
await page.enableCursorOverlay().catch(() => {});
247243
}
@@ -321,12 +317,9 @@ export class V3AgentHandler {
321317
const streamOptions =
322318
typeof instructionOrOptions === "object" ? instructionOrOptions : null;
323319

324-
// Determine effective mode from stream options, default to "dom"
325-
const effectiveMode = streamOptions?.mode ?? "dom";
326-
327320
// Highlight cursor defaults to true for hybrid mode, can be overridden
328321
const shouldHighlightCursor =
329-
streamOptions?.highlightCursor ?? effectiveMode === "hybrid";
322+
streamOptions?.highlightCursor ?? this.mode === "hybrid";
330323

331324
const {
332325
options,
@@ -336,10 +329,10 @@ export class V3AgentHandler {
336329
messages,
337330
wrappedModel,
338331
initialPageUrl,
339-
} = await this.prepareAgent(instructionOrOptions, effectiveMode);
332+
} = await this.prepareAgent(instructionOrOptions);
340333

341334
// Enable cursor overlay for hybrid mode (coordinate-based interactions)
342-
if (shouldHighlightCursor && effectiveMode === "hybrid") {
335+
if (shouldHighlightCursor && this.mode === "hybrid") {
343336
const page = await this.v3.context.awaitActivePage();
344337
await page.enableCursorOverlay().catch(() => {});
345338
}
@@ -474,12 +467,12 @@ export class V3AgentHandler {
474467
};
475468
}
476469

477-
private createTools(mode?: AgentToolMode) {
470+
private createTools() {
478471
const provider = this.llmClient?.getLanguageModel?.()?.provider;
479472
return createAgentTools(this.v3, {
480473
executionModel: this.executionModel,
481474
logger: this.logger,
482-
mode: mode ?? "dom",
475+
mode: this.mode,
483476
provider,
484477
});
485478
}

packages/core/lib/v3/tests/agent-hybrid-mode.spec.ts

Lines changed: 27 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -158,72 +158,52 @@ test.describe("Stagehand agent hybrid mode", () => {
158158
});
159159
});
160160

161-
test.describe("Agent creation", () => {
162-
test("agent() creates agent with execute method", () => {
161+
test.describe("Agent creation with mode", () => {
162+
test("agent({ mode: 'dom' }) creates DOM-mode agent", () => {
163163
const agent = v3.agent({
164+
mode: "dom",
164165
model: "anthropic/claude-haiku-4-5-20251001",
165166
});
166167

167168
expect(agent).toHaveProperty("execute");
168169
});
169170

170-
test("agent with streaming enabled", () => {
171+
test("agent({ mode: 'hybrid' }) creates hybrid-mode agent", () => {
171172
const agent = v3.agent({
172-
stream: true,
173+
mode: "hybrid",
173174
model: "anthropic/claude-haiku-4-5-20251001",
174175
});
175176

176177
expect(agent).toHaveProperty("execute");
177178
});
178-
});
179-
180-
test.describe("Mode execution via execute options", () => {
181-
test("execute with mode: 'hybrid' uses coordinate-based tools", async () => {
182-
test.setTimeout(90000);
183-
184-
const toolCalls: Array<{ toolName: string; input: unknown }> = [];
185179

180+
test("agent without mode defaults to DOM mode", () => {
186181
const agent = v3.agent({
187182
model: "anthropic/claude-haiku-4-5-20251001",
188183
});
189184

190-
const page = v3.context.pages()[0];
191-
await page.goto("https://example.com");
185+
expect(agent).toHaveProperty("execute");
186+
});
192187

193-
await agent.execute({
194-
instruction:
195-
"Take a screenshot to see the page, then use close tool with taskComplete: true",
188+
test("hybrid mode can be combined with streaming", () => {
189+
const agent = v3.agent({
196190
mode: "hybrid",
197-
maxSteps: 5,
198-
callbacks: {
199-
onStepFinish: async (event: StepResult<ToolSet>) => {
200-
if (event.toolCalls) {
201-
for (const tc of event.toolCalls) {
202-
toolCalls.push({
203-
toolName: tc.toolName,
204-
input: tc.input,
205-
});
206-
}
207-
}
208-
},
209-
},
191+
stream: true,
192+
model: "anthropic/claude-haiku-4-5-20251001",
210193
});
211194

212-
// Should have captured tool calls
213-
expect(toolCalls.length).toBeGreaterThan(0);
214-
215-
const toolNames = toolCalls.map((tc) => tc.toolName);
216-
// Should include screenshot (hybrid mode emphasizes visual) and close
217-
expect(toolNames).toContain("screenshot");
218-
expect(toolNames).toContain("close");
195+
expect(agent).toHaveProperty("execute");
219196
});
197+
});
220198

221-
test("execute with mode: 'dom' uses DOM-based tools", async () => {
199+
test.describe("Hybrid mode execution", () => {
200+
test("hybrid mode agent uses coordinate-based tools when available", async () => {
222201
test.setTimeout(90000);
223202

224203
const toolCalls: Array<{ toolName: string; input: unknown }> = [];
225204

226205
const agent = v3.agent({
206+
mode: "hybrid",
227207
model: "anthropic/claude-haiku-4-5-20251001",
228208
});
229209

@@ -232,8 +212,7 @@ test.describe("Stagehand agent hybrid mode", () => {
232212

233213
await agent.execute({
234214
instruction:
235-
"Use the ariaTree to understand the page, then use close tool with taskComplete: true",
236-
mode: "dom",
215+
"Take a screenshot to see the page, then use close tool with taskComplete: true",
237216
maxSteps: 5,
238217
callbacks: {
239218
onStepFinish: async (event: StepResult<ToolSet>) => {
@@ -252,25 +231,28 @@ test.describe("Stagehand agent hybrid mode", () => {
252231
// Should have captured tool calls
253232
expect(toolCalls.length).toBeGreaterThan(0);
254233

255-
// Should include close
256234
const toolNames = toolCalls.map((tc) => tc.toolName);
235+
// Should include screenshot (hybrid mode emphasizes visual) and close
236+
expect(toolNames).toContain("screenshot");
257237
expect(toolNames).toContain("close");
258238
});
259239

260-
test("execute defaults to DOM mode when mode not specified", async () => {
240+
test("DOM mode agent uses DOM-based tools", async () => {
261241
test.setTimeout(90000);
262242

263243
const toolCalls: Array<{ toolName: string; input: unknown }> = [];
264244

265245
const agent = v3.agent({
246+
mode: "dom",
266247
model: "anthropic/claude-haiku-4-5-20251001",
267248
});
268249

269250
const page = v3.context.pages()[0];
270251
await page.goto("https://example.com");
271252

272253
await agent.execute({
273-
instruction: "Use close tool with taskComplete: true",
254+
instruction:
255+
"Use the ariaTree to understand the page, then use close tool with taskComplete: true",
274256
maxSteps: 5,
275257
callbacks: {
276258
onStepFinish: async (event: StepResult<ToolSet>) => {
@@ -286,7 +268,10 @@ test.describe("Stagehand agent hybrid mode", () => {
286268
},
287269
});
288270

271+
// Should have captured tool calls
289272
expect(toolCalls.length).toBeGreaterThan(0);
273+
274+
// Should include close
290275
const toolNames = toolCalls.map((tc) => tc.toolName);
291276
expect(toolNames).toContain("close");
292277
});

packages/core/lib/v3/types/public/agent.ts

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -216,12 +216,6 @@ export interface AgentExecuteOptionsBase {
216216
maxSteps?: number;
217217
page?: PlaywrightPage | PuppeteerPage | PatchrightPage | Page;
218218
highlightCursor?: boolean;
219-
/**
220-
* Tool mode for this execution. Overrides the mode set in AgentConfig.
221-
* - 'dom': Uses DOM-based tools (act, fillForm) for structured interactions
222-
* - 'hybrid': Uses coordinate-based tools (click, type, dragAndDrop, etc.)
223-
*/
224-
mode?: AgentToolMode;
225219
/**
226220
* Previous conversation messages to continue from.
227221
* Pass the `messages` from a previous AgentResult to continue that conversation.
@@ -443,6 +437,13 @@ export type AgentConfig = {
443437
* When false (default), execute() returns AgentResult after completion.
444438
*/
445439
stream?: boolean;
440+
/**
441+
* Tool mode for the agent. Determines which set of tools is available.
442+
* - 'dom' (default): Uses DOM-based tools (act, fillForm) for structured interactions
443+
* - 'hybrid': Uses coordinate-based tools (click, type, dragAndDrop, clickAndHold, fillFormVision)
444+
* for visual/screenshot-based interactions
445+
*/
446+
mode?: AgentToolMode;
446447
};
447448

448449
/**

packages/core/lib/v3/v3.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1611,6 +1611,7 @@ export class V3 {
16111611
: options?.executionModel?.modelName,
16121612
options?.systemPrompt,
16131613
tools,
1614+
options?.mode,
16141615
);
16151616

16161617
const resolvedOptions: AgentExecuteOptions | AgentStreamExecuteOptions =

packages/core/tests/public-api/public-types.test.ts

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -191,7 +191,6 @@ describe("Stagehand public API types", () => {
191191
maxSteps?: number;
192192
page?: Stagehand.AnyPage;
193193
highlightCursor?: boolean;
194-
mode?: Stagehand.AgentToolMode;
195194
messages?: Stagehand.ModelMessage[];
196195
signal?: AbortSignal;
197196
callbacks?: Stagehand.AgentExecuteCallbacks;
@@ -208,7 +207,6 @@ describe("Stagehand public API types", () => {
208207
maxSteps?: number;
209208
page?: Stagehand.AnyPage;
210209
highlightCursor?: boolean;
211-
mode?: Stagehand.AgentToolMode;
212210
messages?: Stagehand.ModelMessage[];
213211
signal?: AbortSignal;
214212
callbacks?: Stagehand.AgentStreamCallbacks;
@@ -266,6 +264,7 @@ describe("Stagehand public API types", () => {
266264
model?: string | Stagehand.AgentModelConfig<string>;
267265
executionModel?: string | Stagehand.AgentModelConfig<string>;
268266
stream?: boolean;
267+
mode?: Stagehand.AgentToolMode;
269268
};
270269

271270
it("matches expected type shape", () => {

0 commit comments

Comments
 (0)