Skip to content

Commit ca0630e

Browse files
authored
Update the agent evals cli (#1364)
# why

After the transition to v3, the model handling for agent evals was not updated to account for the new model formats.

# what changed

- added an isCUA flag and two separate model maps so that models can be run either with CUA or without it
- adjusted model handling to properly parse CUA models
- added a tag to distinguish whether a run is using CUA or not

# test plan

- tested evals for both CUA and non-CUA models

<!-- This is an auto-generated description by cubic. -->

---

## Summary by cubic

Updated the agent evals CLI to support and correctly run both CUA and non-CUA agent models in v3. Fixes agent model parsing and enables mixed eval runs.

- **New Features**
  - Split agent models into standard and CUA lists; added getAgentModelEntries with a cua flag.
  - Passed isCUA through EvalInput to initV3 and tasks; selects a safe internal model for handlers when CUA.
  - Improved provider lookup and error messages for CUA models using short names; testcases now tag models as "cua" or "agent".

<sup>Written for commit 13b906c. Summary will update automatically on new commits.</sup>

<!-- End of auto-generated description by cubic. -->
1 parent 898e1f4 commit ca0630e

File tree

5 files changed

+77
-18
lines changed

5 files changed

+77
-18
lines changed

.changeset/fruity-hounds-rush.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@browserbasehq/stagehand-evals": patch
3+
---
4+
5+
Update model handling in agent evals cli

packages/evals/index.eval.ts

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,12 @@ import {
2121
} from "./args";
2222
import { generateExperimentName } from "./utils";
2323
import { exactMatch, errorMatch } from "./scoring";
24-
import { tasksByName, tasksConfig, getModelList } from "./taskConfig";
24+
import {
25+
tasksByName,
26+
tasksConfig,
27+
getModelList,
28+
getAgentModelEntries,
29+
} from "./taskConfig";
2530
import { Eval } from "braintrust";
2631
import { SummaryResult, Testcase, EvalInput } from "./types/evals";
2732
import { EvalLogger } from "./logger";
@@ -171,19 +176,33 @@ const generateFilteredTestcases = (): Testcase[] => {
171176
}
172177

173178
// Create a list of all remaining testcases using the determined task names and models
174-
const regularTestcases = currentModels.flatMap((model) =>
179+
const isAgentCategory =
180+
effectiveCategory === "agent" ||
181+
effectiveCategory === "external_agent_benchmarks";
182+
183+
// Use agent model entries (with cua flag) for agent categories, otherwise map currentModels
184+
const modelEntries = isAgentCategory
185+
? getAgentModelEntries()
186+
: currentModels.map((m) => ({ modelName: m, cua: false }));
187+
188+
const regularTestcases = modelEntries.flatMap((entry) =>
175189
taskNamesToRun.map((testName) => ({
176-
input: { name: testName, modelName: model as AvailableModel },
190+
input: {
191+
name: testName,
192+
modelName: entry.modelName as AvailableModel,
193+
...(isAgentCategory && { isCUA: entry.cua }),
194+
},
177195
name: testName,
178196
tags: [
179-
model,
197+
entry.modelName,
198+
...(isAgentCategory ? [entry.cua ? "cua" : "agent"] : []),
180199
testName,
181200
...(tasksConfig.find((t) => t.name === testName)?.categories || []).map(
182201
(x) => `category/${x}`,
183202
),
184203
],
185204
metadata: {
186-
model: model as AvailableModel,
205+
model: entry.modelName as AvailableModel,
187206
test: testName,
188207
},
189208
expected: true,
@@ -344,6 +363,7 @@ const generateFilteredTestcases = (): Testcase[] => {
344363
modelName: input.modelName,
345364
modelClientOptions: { apiKey: apiKey },
346365
createAgent: isAgentTask,
366+
isCUA: input.isCUA,
347367
});
348368
} else {
349369
let llmClient: LLMClient;
@@ -360,6 +380,7 @@ const generateFilteredTestcases = (): Testcase[] => {
360380
llmClient,
361381
modelName: input.modelName,
362382
createAgent: isAgentTask,
383+
isCUA: input.isCUA,
363384
});
364385
}
365386
// Pass full EvalInput to the task (data-driven params available via input.params)

packages/evals/initV3.ts

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ type InitV3Args = {
3232
domSettleTimeoutMs?: number; // retained for parity; v3 handlers accept timeouts per-call
3333
logger: EvalLogger;
3434
createAgent?: boolean; // only create an agent for agent tasks
35+
isCUA?: boolean;
3536
configOverrides?: {
3637
localBrowserLaunchOptions?: Partial<
3738
Pick<LocalBrowserLaunchOptions, "headless" | "args">
@@ -62,10 +63,8 @@ export async function initV3({
6263
configOverrides,
6364
modelName,
6465
createAgent,
66+
isCUA,
6567
}: InitV3Args): Promise<V3InitResult> {
66-
// Determine if the requested model is a CUA model
67-
const isCUA = modelName in modelToAgentProviderMap;
68-
6968
// If CUA, choose a safe internal AISDK model for V3 handlers based on available API keys
7069
let internalModel: AvailableModel = modelName;
7170
if (isCUA) {
@@ -130,10 +129,19 @@ export async function initV3({
130129
let agent: AgentInstance | undefined;
131130
if (createAgent) {
132131
if (isCUA) {
133-
const apiKey = loadApiKeyFromEnv(
134-
modelToAgentProviderMap[modelName],
135-
logger.log.bind(logger),
136-
);
132+
const shortModelName = modelName.includes("/")
133+
? modelName.split("/")[1]
134+
: modelName;
135+
136+
const providerType = modelToAgentProviderMap[shortModelName];
137+
if (!providerType) {
138+
throw new Error(
139+
`CUA model "${shortModelName}" not found in modelToAgentProviderMap. ` +
140+
`Available: ${Object.keys(modelToAgentProviderMap).join(", ")}`,
141+
);
142+
}
143+
144+
const apiKey = loadApiKeyFromEnv(providerType, logger.log.bind(logger));
137145

138146
const cuaModel: AvailableCuaModel | AgentModelConfig<AvailableCuaModel> =
139147
apiKey && apiKey.length > 0

packages/evals/taskConfig.ts

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import fs from "fs";
1414
import path from "path";
1515
import { AvailableModel } from "@browserbasehq/stagehand";
1616
import { filterByEvalName } from "./args";
17+
import { AgentModelEntry } from "./types/evals";
1718

1819
const ALL_EVAL_MODELS = [
1920
// GOOGLE
@@ -104,15 +105,27 @@ const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
104105
"anthropic/claude-haiku-4-5",
105106
];
106107

107-
const DEFAULT_AGENT_MODELS = process.env.EVAL_AGENT_MODELS
108+
// Standard agent models - these run with stagehand.agent()
109+
const AGENT_MODELS = process.env.EVAL_AGENT_MODELS
108110
? process.env.EVAL_AGENT_MODELS.split(",")
111+
: ["anthropic/claude-sonnet-4-20250514"];
112+
113+
// CUA agent models - these run with stagehand.agent({ cua: true })
114+
const AGENT_MODELS_CUA = process.env.EVAL_AGENT_MODELS_CUA
115+
? process.env.EVAL_AGENT_MODELS_CUA.split(",")
109116
: [
110-
"computer-use-preview-2025-03-11",
111-
"claude-sonnet-4-20250514",
112-
"gemini-2.5-computer-use-preview-10-2025",
113-
// "anthropic/claude-sonnet-4-20250514",
117+
"openai/computer-use-preview-2025-03-11",
118+
"anthropic/claude-sonnet-4-20250514",
119+
"google/gemini-2.5-computer-use-preview-10-2025",
114120
];
115121

122+
const AGENT_MODEL_ENTRIES: AgentModelEntry[] = [
123+
...AGENT_MODELS.map((m) => ({ modelName: m, cua: false })),
124+
...AGENT_MODELS_CUA.map((m) => ({ modelName: m, cua: true })),
125+
];
126+
127+
const DEFAULT_AGENT_MODELS = AGENT_MODEL_ENTRIES.map((e) => e.modelName);
128+
116129
/**
117130
* getModelList:
118131
* Returns a list of models to be used for the given category.
@@ -167,4 +180,10 @@ const MODELS: AvailableModel[] = getModelList().map((model) => {
167180
return model as AvailableModel;
168181
});
169182

170-
export { tasksByName, MODELS, tasksConfig, getModelList };
183+
/**
184+
* Get agent model entries with CUA flag for test case generation.
185+
*/
186+
const getAgentModelEntries = (): AgentModelEntry[] => AGENT_MODEL_ENTRIES;
187+
188+
export { tasksByName, MODELS, tasksConfig, getModelList, getAgentModelEntries };
189+
export type { AgentModelEntry };

packages/evals/types/evals.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ export type EvalCategory = z.infer<typeof EvalCategorySchema>;
4444
export interface EvalInput {
4545
name: string;
4646
modelName: AvailableModel;
47+
isCUA?: boolean;
4748
// Optional per-test parameters, used by data-driven tasks
4849
params?: Record<string, unknown>;
4950
}
@@ -83,3 +84,8 @@ export interface EvalResult {
8384
export type LogLineEval = LogLine & {
8485
parsedAuxiliary?: string | object;
8586
};
87+
88+
export type AgentModelEntry = {
89+
modelName: string;
90+
cua: boolean;
91+
};

0 commit comments

Comments
 (0)