Update the agent evals cli (#1364)

tkattkat · web-flow · commit ca0630e4d96b · 2025-12-04T16:42:28.000-08:00
# why

After the transition to v3, the model handling for agent evals was not updated to account for the new model formats.

# what changed

- added an `isCUA` flag and two separate model maps to allow for models that can be run with CUA and without CUA
- adjusted model handling to properly parse CUA models
- added a tag to distinguish whether a run is using CUA or non-CUA

# test plan

- tested evals for CUA and non-CUA

---

## Summary by cubic

Updated the agent evals CLI to support and correctly run both CUA and non-CUA agent models in v3. Fixes agent model parsing and enables mixed eval runs.

- **New Features**
  - Split agent models into standard and CUA lists; added getAgentModelEntries with a cua flag.
  - Passed isCUA through EvalInput to initV3 and tasks; selects a safe internal model for handlers when CUA.
  - Improved provider lookup and error messages for CUA models using short names; testcases now tag models as "cua" or "agent".

<sup>Written for commit 13b906c. Summary will update automatically on new commits.</sup>
diff --git a/.changeset/fruity-hounds-rush.md b/.changeset/fruity-hounds-rush.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand-evals": patch
+---
+
+Update model handling in agent evals cli
diff --git a/packages/evals/index.eval.ts b/packages/evals/index.eval.ts
@@ -21,7 +21,12 @@ import {
 } from "./args";
 import { generateExperimentName } from "./utils";
 import { exactMatch, errorMatch } from "./scoring";
-import { tasksByName, tasksConfig, getModelList } from "./taskConfig";
+import {
+  tasksByName,
+  tasksConfig,
+  getModelList,
+  getAgentModelEntries,
+} from "./taskConfig";
 import { Eval } from "braintrust";
 import { SummaryResult, Testcase, EvalInput } from "./types/evals";
 import { EvalLogger } from "./logger";
@@ -171,19 +176,33 @@ const generateFilteredTestcases = (): Testcase[] => {
   }
 
   // Create a list of all remaining testcases using the determined task names and models
-  const regularTestcases = currentModels.flatMap((model) =>
+  const isAgentCategory =
+    effectiveCategory === "agent" ||
+    effectiveCategory === "external_agent_benchmarks";
+
+  // Use agent model entries (with cua flag) for agent categories, otherwise map currentModels
+  const modelEntries = isAgentCategory
+    ? getAgentModelEntries()
+    : currentModels.map((m) => ({ modelName: m, cua: false }));
+
+  const regularTestcases = modelEntries.flatMap((entry) =>
     taskNamesToRun.map((testName) => ({
-      input: { name: testName, modelName: model as AvailableModel },
+      input: {
+        name: testName,
+        modelName: entry.modelName as AvailableModel,
+        ...(isAgentCategory && { isCUA: entry.cua }),
+      },
       name: testName,
       tags: [
-        model,
+        entry.modelName,
+        ...(isAgentCategory ? [entry.cua ? "cua" : "agent"] : []),
         testName,
         ...(tasksConfig.find((t) => t.name === testName)?.categories || []).map(
           (x) => `category/${x}`,
         ),
       ],
       metadata: {
-        model: model as AvailableModel,
+        model: entry.modelName as AvailableModel,
         test: testName,
       },
       expected: true,
@@ -344,6 +363,7 @@ const generateFilteredTestcases = (): Testcase[] => {
               modelName: input.modelName,
               modelClientOptions: { apiKey: apiKey },
               createAgent: isAgentTask,
+              isCUA: input.isCUA,
             });
           } else {
             let llmClient: LLMClient;
@@ -360,6 +380,7 @@ const generateFilteredTestcases = (): Testcase[] => {
               llmClient,
               modelName: input.modelName,
               createAgent: isAgentTask,
+              isCUA: input.isCUA,
             });
           }
           // Pass full EvalInput to the task (data-driven params available via input.params)
diff --git a/packages/evals/initV3.ts b/packages/evals/initV3.ts
@@ -32,6 +32,7 @@ type InitV3Args = {
   domSettleTimeoutMs?: number; // retained for parity; v3 handlers accept timeouts per-call
   logger: EvalLogger;
   createAgent?: boolean; // only create an agent for agent tasks
+  isCUA?: boolean;
   configOverrides?: {
     localBrowserLaunchOptions?: Partial<
       Pick<LocalBrowserLaunchOptions, "headless" | "args">
@@ -62,10 +63,8 @@ export async function initV3({
   configOverrides,
   modelName,
   createAgent,
+  isCUA,
 }: InitV3Args): Promise<V3InitResult> {
-  // Determine if the requested model is a CUA model
-  const isCUA = modelName in modelToAgentProviderMap;
-
   // If CUA, choose a safe internal AISDK model for V3 handlers based on available API keys
   let internalModel: AvailableModel = modelName;
   if (isCUA) {
@@ -130,10 +129,19 @@ export async function initV3({
   let agent: AgentInstance | undefined;
   if (createAgent) {
     if (isCUA) {
-      const apiKey = loadApiKeyFromEnv(
-        modelToAgentProviderMap[modelName],
-        logger.log.bind(logger),
-      );
+      const shortModelName = modelName.includes("/")
+        ? modelName.split("/")[1]
+        : modelName;
+
+      const providerType = modelToAgentProviderMap[shortModelName];
+      if (!providerType) {
+        throw new Error(
+          `CUA model "${shortModelName}" not found in modelToAgentProviderMap. ` +
+            `Available: ${Object.keys(modelToAgentProviderMap).join(", ")}`,
+        );
+      }
+
+      const apiKey = loadApiKeyFromEnv(providerType, logger.log.bind(logger));
 
       const cuaModel: AvailableCuaModel | AgentModelConfig<AvailableCuaModel> =
         apiKey && apiKey.length > 0
diff --git a/packages/evals/taskConfig.ts b/packages/evals/taskConfig.ts
@@ -14,6 +14,7 @@ import fs from "fs";
 import path from "path";
 import { AvailableModel } from "@browserbasehq/stagehand";
 import { filterByEvalName } from "./args";
+import { AgentModelEntry } from "./types/evals";
 
 const ALL_EVAL_MODELS = [
   // GOOGLE
@@ -104,15 +105,27 @@ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
       "anthropic/claude-haiku-4-5",
     ];
 
-const DEFAULT_AGENT_MODELS = process.env.EVAL_AGENT_MODELS
+// Standard agent models - these run with stagehand.agent()
+const AGENT_MODELS = process.env.EVAL_AGENT_MODELS
   ? process.env.EVAL_AGENT_MODELS.split(",")
+  : ["anthropic/claude-sonnet-4-20250514"];
+
+// CUA agent models - these run with stagehand.agent({ cua: true })
+const AGENT_MODELS_CUA = process.env.EVAL_AGENT_MODELS_CUA
+  ? process.env.EVAL_AGENT_MODELS_CUA.split(",")
   : [
-      "computer-use-preview-2025-03-11",
-      "claude-sonnet-4-20250514",
-      "gemini-2.5-computer-use-preview-10-2025",
-      // "anthropic/claude-sonnet-4-20250514",
+      "openai/computer-use-preview-2025-03-11",
+      "anthropic/claude-sonnet-4-20250514",
+      "google/gemini-2.5-computer-use-preview-10-2025",
     ];
 
+const AGENT_MODEL_ENTRIES: AgentModelEntry[] = [
+  ...AGENT_MODELS.map((m) => ({ modelName: m, cua: false })),
+  ...AGENT_MODELS_CUA.map((m) => ({ modelName: m, cua: true })),
+];
+
+const DEFAULT_AGENT_MODELS = AGENT_MODEL_ENTRIES.map((e) => e.modelName);
+
 /**
  * getModelList:
  * Returns a list of models to be used for the given category.
@@ -167,4 +180,10 @@ const MODELS: AvailableModel[] = getModelList().map((model) => {
   return model as AvailableModel;
 });
 
-export { tasksByName, MODELS, tasksConfig, getModelList };
+/**
+ * Get agent model entries with CUA flag for test case generation.
+ */
+const getAgentModelEntries = (): AgentModelEntry[] => AGENT_MODEL_ENTRIES;
+
+export { tasksByName, MODELS, tasksConfig, getModelList, getAgentModelEntries };
+export type { AgentModelEntry };
diff --git a/packages/evals/types/evals.ts b/packages/evals/types/evals.ts
@@ -44,6 +44,7 @@ export type EvalCategory = z.infer<typeof EvalCategorySchema>;
 export interface EvalInput {
   name: string;
   modelName: AvailableModel;
+  isCUA?: boolean;
   // Optional per-test parameters, used by data-driven tasks
   params?: Record<string, unknown>;
 }
@@ -83,3 +84,8 @@ export interface EvalResult {
 export type LogLineEval = LogLine & {
   parsedAuxiliary?: string | object;
 };
+
+export type AgentModelEntry = {
+  modelName: string;
+  cua: boolean;
+};

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +"@browserbasehq/stagehand-evals": patch
 +---
++
 +Update model handling in agent evals cli