Move agent tool mode from per-execute options back to the agent definition (AgentConfig)

tkattkat · tkattkat · commit 0d9f8e47956d · 2025-12-17T16:02:15.000-08:00
diff --git a/packages/core/lib/v3/handlers/v3AgentHandler.ts b/packages/core/lib/v3/handlers/v3AgentHandler.ts
@@ -45,6 +45,7 @@ export class V3AgentHandler {
   private executionModel?: string;
   private systemInstructions?: string;
   private mcpTools?: ToolSet;
+  private mode: AgentToolMode;
 
   constructor(
     v3: V3,
@@ -53,18 +54,19 @@ export class V3AgentHandler {
     executionModel?: string,
     systemInstructions?: string,
     mcpTools?: ToolSet,
+    mode?: AgentToolMode,
   ) {
     this.v3 = v3;
     this.logger = logger;
     this.llmClient = llmClient;
     this.executionModel = executionModel;
     this.systemInstructions = systemInstructions;
     this.mcpTools = mcpTools;
+    this.mode = mode ?? "dom";
   }
 
   private async prepareAgent(
     instructionOrOptions: string | AgentExecuteOptionsBase,
-    effectiveMode?: AgentToolMode,
   ): Promise<AgentContext> {
     try {
       const options =
@@ -74,22 +76,19 @@ export class V3AgentHandler {
 
       const maxSteps = options.maxSteps || 20;
 
-      // Use effective mode passed from execute/stream, default to "dom"
-      const mode = effectiveMode ?? "dom";
-
       // Get the initial page URL first (needed for the system prompt)
       const initialPageUrl = (await this.v3.context.awaitActivePage()).url();
 
       // Build the system prompt with mode-aware tool guidance
       const systemPrompt = buildAgentSystemPrompt({
         url: initialPageUrl,
         executionInstruction: options.instruction,
-        mode,
+        mode: this.mode,
         systemInstructions: this.systemInstructions,
         isBrowserbase: this.v3.isBrowserbase,
       });
 
-      const tools = this.createTools(mode);
+      const tools = this.createTools();
       const allTools: ToolSet = { ...tools, ...this.mcpTools };
 
       // Use provided messages for continuation, or start fresh with the instruction
@@ -212,12 +211,9 @@ export class V3AgentHandler {
       typeof instructionOrOptions === "object" ? instructionOrOptions : null;
     const signal = options?.signal;
 
-    // Determine effective mode from execute options, default to "dom"
-    const effectiveMode = options?.mode ?? "dom";
-
     // Highlight cursor defaults to true for hybrid mode, can be overridden
     const shouldHighlightCursor =
-      options?.highlightCursor ?? effectiveMode === "hybrid";
+      options?.highlightCursor ?? this.mode === "hybrid";
 
     const state: AgentState = {
       collectedReasoning: [],
@@ -238,10 +234,10 @@ export class V3AgentHandler {
         messages: preparedMessages,
         wrappedModel,
         initialPageUrl,
-      } = await this.prepareAgent(instructionOrOptions, effectiveMode);
+      } = await this.prepareAgent(instructionOrOptions);
 
       // Enable cursor overlay for hybrid mode (coordinate-based interactions)
-      if (shouldHighlightCursor && effectiveMode === "hybrid") {
+      if (shouldHighlightCursor && this.mode === "hybrid") {
         const page = await this.v3.context.awaitActivePage();
         await page.enableCursorOverlay().catch(() => {});
       }
@@ -321,12 +317,9 @@ export class V3AgentHandler {
     const streamOptions =
       typeof instructionOrOptions === "object" ? instructionOrOptions : null;
 
-    // Determine effective mode from stream options, default to "dom"
-    const effectiveMode = streamOptions?.mode ?? "dom";
-
     // Highlight cursor defaults to true for hybrid mode, can be overridden
     const shouldHighlightCursor =
-      streamOptions?.highlightCursor ?? effectiveMode === "hybrid";
+      streamOptions?.highlightCursor ?? this.mode === "hybrid";
 
     const {
       options,
@@ -336,10 +329,10 @@ export class V3AgentHandler {
       messages,
       wrappedModel,
       initialPageUrl,
-    } = await this.prepareAgent(instructionOrOptions, effectiveMode);
+    } = await this.prepareAgent(instructionOrOptions);
 
     // Enable cursor overlay for hybrid mode (coordinate-based interactions)
-    if (shouldHighlightCursor && effectiveMode === "hybrid") {
+    if (shouldHighlightCursor && this.mode === "hybrid") {
       const page = await this.v3.context.awaitActivePage();
       await page.enableCursorOverlay().catch(() => {});
     }
@@ -474,12 +467,12 @@ export class V3AgentHandler {
     };
   }
 
-  private createTools(mode?: AgentToolMode) {
+  private createTools() {
     const provider = this.llmClient?.getLanguageModel?.()?.provider;
     return createAgentTools(this.v3, {
       executionModel: this.executionModel,
       logger: this.logger,
-      mode: mode ?? "dom",
+      mode: this.mode,
       provider,
     });
   }
diff --git a/packages/core/lib/v3/tests/agent-hybrid-mode.spec.ts b/packages/core/lib/v3/tests/agent-hybrid-mode.spec.ts
@@ -158,72 +158,52 @@ test.describe("Stagehand agent hybrid mode", () => {
     });
   });
 
-  test.describe("Agent creation", () => {
-    test("agent() creates agent with execute method", () => {
+  test.describe("Agent creation with mode", () => {
+    test("agent({ mode: 'dom' }) creates DOM-mode agent", () => {
       const agent = v3.agent({
+        mode: "dom",
         model: "anthropic/claude-haiku-4-5-20251001",
       });
 
       expect(agent).toHaveProperty("execute");
     });
 
-    test("agent with streaming enabled", () => {
+    test("agent({ mode: 'hybrid' }) creates hybrid-mode agent", () => {
       const agent = v3.agent({
-        stream: true,
+        mode: "hybrid",
         model: "anthropic/claude-haiku-4-5-20251001",
       });
 
       expect(agent).toHaveProperty("execute");
     });
-  });
-
-  test.describe("Mode execution via execute options", () => {
-    test("execute with mode: 'hybrid' uses coordinate-based tools", async () => {
-      test.setTimeout(90000);
-
-      const toolCalls: Array<{ toolName: string; input: unknown }> = [];
 
+    test("agent without mode defaults to DOM mode", () => {
       const agent = v3.agent({
         model: "anthropic/claude-haiku-4-5-20251001",
       });
 
-      const page = v3.context.pages()[0];
-      await page.goto("https://example.com");
+      expect(agent).toHaveProperty("execute");
+    });
 
-      await agent.execute({
-        instruction:
-          "Take a screenshot to see the page, then use close tool with taskComplete: true",
+    test("hybrid mode can be combined with streaming", () => {
+      const agent = v3.agent({
         mode: "hybrid",
-        maxSteps: 5,
-        callbacks: {
-          onStepFinish: async (event: StepResult<ToolSet>) => {
-            if (event.toolCalls) {
-              for (const tc of event.toolCalls) {
-                toolCalls.push({
-                  toolName: tc.toolName,
-                  input: tc.input,
-                });
-              }
-            }
-          },
-        },
+        stream: true,
+        model: "anthropic/claude-haiku-4-5-20251001",
       });
 
-      // Should have captured tool calls
-      expect(toolCalls.length).toBeGreaterThan(0);
-
-      const toolNames = toolCalls.map((tc) => tc.toolName);
-      // Should include screenshot (hybrid mode emphasizes visual) and close
-      expect(toolNames).toContain("screenshot");
-      expect(toolNames).toContain("close");
+      expect(agent).toHaveProperty("execute");
     });
+  });
 
-    test("execute with mode: 'dom' uses DOM-based tools", async () => {
+  test.describe("Hybrid mode execution", () => {
+    test("hybrid mode agent uses coordinate-based tools when available", async () => {
       test.setTimeout(90000);
 
       const toolCalls: Array<{ toolName: string; input: unknown }> = [];
 
       const agent = v3.agent({
+        mode: "hybrid",
         model: "anthropic/claude-haiku-4-5-20251001",
       });
 
@@ -232,8 +212,7 @@ test.describe("Stagehand agent hybrid mode", () => {
 
       await agent.execute({
         instruction:
-          "Use the ariaTree to understand the page, then use close tool with taskComplete: true",
-        mode: "dom",
+          "Take a screenshot to see the page, then use close tool with taskComplete: true",
         maxSteps: 5,
         callbacks: {
           onStepFinish: async (event: StepResult<ToolSet>) => {
@@ -252,25 +231,28 @@ test.describe("Stagehand agent hybrid mode", () => {
       // Should have captured tool calls
       expect(toolCalls.length).toBeGreaterThan(0);
 
-      // Should include close
       const toolNames = toolCalls.map((tc) => tc.toolName);
+      // Should include screenshot (hybrid mode emphasizes visual) and close
+      expect(toolNames).toContain("screenshot");
       expect(toolNames).toContain("close");
     });
 
-    test("execute defaults to DOM mode when mode not specified", async () => {
+    test("DOM mode agent uses DOM-based tools", async () => {
       test.setTimeout(90000);
 
       const toolCalls: Array<{ toolName: string; input: unknown }> = [];
 
       const agent = v3.agent({
+        mode: "dom",
         model: "anthropic/claude-haiku-4-5-20251001",
       });
 
       const page = v3.context.pages()[0];
       await page.goto("https://example.com");
 
       await agent.execute({
-        instruction: "Use close tool with taskComplete: true",
+        instruction:
+          "Use the ariaTree to understand the page, then use close tool with taskComplete: true",
         maxSteps: 5,
         callbacks: {
           onStepFinish: async (event: StepResult<ToolSet>) => {
@@ -286,7 +268,10 @@ test.describe("Stagehand agent hybrid mode", () => {
         },
       });
 
+      // Should have captured tool calls
       expect(toolCalls.length).toBeGreaterThan(0);
+
+      // Should include close
       const toolNames = toolCalls.map((tc) => tc.toolName);
       expect(toolNames).toContain("close");
     });
diff --git a/packages/core/lib/v3/types/public/agent.ts b/packages/core/lib/v3/types/public/agent.ts
@@ -216,12 +216,6 @@ export interface AgentExecuteOptionsBase {
   maxSteps?: number;
   page?: PlaywrightPage | PuppeteerPage | PatchrightPage | Page;
   highlightCursor?: boolean;
-  /**
-   * Tool mode for this execution. Overrides the mode set in AgentConfig.
-   * - 'dom': Uses DOM-based tools (act, fillForm) for structured interactions
-   * - 'hybrid': Uses coordinate-based tools (click, type, dragAndDrop, etc.)
-   */
-  mode?: AgentToolMode;
   /**
    * Previous conversation messages to continue from.
    * Pass the `messages` from a previous AgentResult to continue that conversation.
@@ -443,6 +437,13 @@ export type AgentConfig = {
    * When false (default), execute() returns AgentResult after completion.
    */
   stream?: boolean;
+  /**
+   * Tool mode for the agent. Determines which set of tools is available.
+   * - 'dom' (default): Uses DOM-based tools (act, fillForm) for structured interactions
+   * - 'hybrid': Uses coordinate-based tools (click, type, dragAndDrop, clickAndHold, fillFormVision)
+   *             for visual/screenshot-based interactions
+   */
+  mode?: AgentToolMode;
 };
 
 /**
diff --git a/packages/core/lib/v3/v3.ts b/packages/core/lib/v3/v3.ts
@@ -1611,6 +1611,7 @@ export class V3 {
         : options?.executionModel?.modelName,
       options?.systemPrompt,
       tools,
+      options?.mode,
     );
 
     const resolvedOptions: AgentExecuteOptions | AgentStreamExecuteOptions =
diff --git a/packages/core/tests/public-api/public-types.test.ts b/packages/core/tests/public-api/public-types.test.ts
@@ -191,7 +191,6 @@ describe("Stagehand public API types", () => {
       maxSteps?: number;
       page?: Stagehand.AnyPage;
       highlightCursor?: boolean;
-      mode?: Stagehand.AgentToolMode;
       messages?: Stagehand.ModelMessage[];
       signal?: AbortSignal;
       callbacks?: Stagehand.AgentExecuteCallbacks;
@@ -208,7 +207,6 @@ describe("Stagehand public API types", () => {
       maxSteps?: number;
       page?: Stagehand.AnyPage;
       highlightCursor?: boolean;
-      mode?: Stagehand.AgentToolMode;
       messages?: Stagehand.ModelMessage[];
       signal?: AbortSignal;
       callbacks?: Stagehand.AgentStreamCallbacks;
@@ -266,6 +264,7 @@ describe("Stagehand public API types", () => {
       model?: string | Stagehand.AgentModelConfig<string>;
       executionModel?: string | Stagehand.AgentModelConfig<string>;
       stream?: boolean;
+      mode?: Stagehand.AgentToolMode;
     };
 
     it("matches expected type shape", () => {