Skip to content

Commit ed51c0e

Browse files
committed
feat(web-evals): add iterations support and database migration
- Add iterations slider (1-10) to new run form
- Add iteration column to tasks table schema
- Add ESC key handler to close task log dialog
- Update run display to show iteration number for repeated tasks
- Add database migration for iteration column
1 parent 861ad71 commit ed51c0e

File tree

9 files changed

+618
-8
lines changed

9 files changed

+618
-8
lines changed

apps/web-evals/src/actions/runs.ts

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import { CreateRun } from "@/lib/schemas"
2121

2222
const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals")
2323

24-
export async function createRun({ suite, exercises = [], timeout, ...values }: CreateRun) {
24+
export async function createRun({ suite, exercises = [], timeout, iterations = 1, ...values }: CreateRun) {
2525
const run = await _createRun({
2626
...values,
2727
timeout,
@@ -36,15 +36,34 @@ export async function createRun({ suite, exercises = [], timeout, ...values }: C
3636
throw new Error("Invalid exercise path: " + path)
3737
}
3838

39-
await createTask({ ...values, runId: run.id, language: language as ExerciseLanguage, exercise })
39+
// Create one task per iteration of this exercise
40+
for (let iteration = 1; iteration <= iterations; iteration++) {
41+
await createTask({
42+
...values,
43+
runId: run.id,
44+
language: language as ExerciseLanguage,
45+
exercise,
46+
iteration,
47+
})
48+
}
4049
}
4150
} else {
4251
for (const language of exerciseLanguages) {
43-
const exercises = await getExercisesForLanguage(EVALS_REPO_PATH, language)
52+
const languageExercises = await getExercisesForLanguage(EVALS_REPO_PATH, language)
53+
54+
// Create tasks for all iterations of each exercise
55+
const tasksToCreate: Array<{ language: ExerciseLanguage; exercise: string; iteration: number }> = []
56+
for (const exercise of languageExercises) {
57+
for (let iteration = 1; iteration <= iterations; iteration++) {
58+
tasksToCreate.push({ language, exercise, iteration })
59+
}
60+
}
4461

45-
await pMap(exercises, (exercise) => createTask({ runId: run.id, language, exercise }), {
46-
concurrency: 10,
47-
})
62+
await pMap(
63+
tasksToCreate,
64+
({ language, exercise, iteration }) => createTask({ runId: run.id, language, exercise, iteration }),
65+
{ concurrency: 10 },
66+
)
4867
}
4968
}
5069

apps/web-evals/src/app/runs/[id]/run.tsx

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"use client"
22

3-
import { useMemo, useState, useCallback } from "react"
3+
import { useMemo, useState, useCallback, useEffect } from "react"
44
import { toast } from "sonner"
55
import { LoaderCircle, FileText, Copy, Check } from "lucide-react"
66

@@ -116,6 +116,18 @@ export function Run({ run }: { run: Run }) {
116116
}
117117
}, [taskLog])
118118

119+
// Handle ESC key to close the dialog
120+
useEffect(() => {
121+
const handleKeyDown = (e: KeyboardEvent) => {
122+
if (e.key === "Escape" && selectedTask) {
123+
setSelectedTask(null)
124+
}
125+
}
126+
127+
document.addEventListener("keydown", handleKeyDown)
128+
return () => document.removeEventListener("keydown", handleKeyDown)
129+
}, [selectedTask])
130+
119131
const onViewTaskLog = useCallback(
120132
async (task: Task) => {
121133
// Only allow viewing logs for completed tasks
@@ -366,6 +378,11 @@ export function Run({ run }: { run: Run }) {
366378
<div className="flex items-center gap-2">
367379
<span>
368380
{task.language}/{task.exercise}
381+
{task.iteration > 1 && (
382+
<span className="text-muted-foreground ml-1">
383+
(#{task.iteration})
384+
</span>
385+
)}
369386
</span>
370387
{task.finishedAt && (
371388
<Tooltip>
@@ -416,6 +433,9 @@ export function Run({ run }: { run: Run }) {
416433
<DialogTitle className="flex items-center gap-2">
417434
<FileText className="size-4" />
418435
{selectedTask?.language}/{selectedTask?.exercise}
436+
{selectedTask?.iteration && selectedTask.iteration > 1 && (
437+
<span className="text-muted-foreground">(#{selectedTask.iteration})</span>
438+
)}
419439
<span
420440
className={`ml-2 text-sm ${selectedTask?.passed ? "text-green-600" : "text-red-600"}`}>
421441
({selectedTask?.passed ? "Passed" : "Failed"})

apps/web-evals/src/app/runs/new/new-run.tsx

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ import {
3131
TIMEOUT_MIN,
3232
TIMEOUT_MAX,
3333
TIMEOUT_DEFAULT,
34+
ITERATIONS_MIN,
35+
ITERATIONS_MAX,
36+
ITERATIONS_DEFAULT,
3437
} from "@/lib/schemas"
3538
import { cn } from "@/lib/utils"
3639

@@ -41,6 +44,7 @@ import {
4144
Button,
4245
Checkbox,
4346
FormControl,
47+
FormDescription,
4448
FormField,
4549
FormItem,
4650
FormLabel,
@@ -116,6 +120,7 @@ export function NewRun() {
116120
settings: undefined,
117121
concurrency: CONCURRENCY_DEFAULT,
118122
timeout: TIMEOUT_DEFAULT,
123+
iterations: ITERATIONS_DEFAULT,
119124
jobToken: "",
120125
},
121126
})
@@ -727,6 +732,32 @@ export function NewRun() {
727732
)}
728733
/>
729734

735+
<FormField
736+
control={form.control}
737+
name="iterations"
738+
render={({ field }) => (
739+
<FormItem>
740+
<FormLabel>Iterations per Exercise</FormLabel>
741+
<FormControl>
742+
<div className="flex flex-row items-center gap-2">
743+
<Slider
744+
value={[field.value]}
745+
min={ITERATIONS_MIN}
746+
max={ITERATIONS_MAX}
747+
step={1}
748+
onValueChange={(value) => {
749+
field.onChange(value[0])
750+
}}
751+
/>
752+
<div>{field.value}</div>
753+
</div>
754+
</FormControl>
755+
<FormDescription>Run each exercise multiple times to compare results</FormDescription>
756+
<FormMessage />
757+
</FormItem>
758+
)}
759+
/>
760+
730761
<FormField
731762
control={form.control}
732763
name="description"

apps/web-evals/src/lib/schemas.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ export const TIMEOUT_MIN = 5
1414
export const TIMEOUT_MAX = 10
1515
export const TIMEOUT_DEFAULT = 5
1616

17+
export const ITERATIONS_MIN = 1
18+
export const ITERATIONS_MAX = 10
19+
export const ITERATIONS_DEFAULT = 1
20+
1721
export const createRunSchema = z
1822
.object({
1923
model: z.string().min(1, { message: "Model is required." }),
@@ -23,6 +27,7 @@ export const createRunSchema = z
2327
settings: rooCodeSettingsSchema.optional(),
2428
concurrency: z.number().int().min(CONCURRENCY_MIN).max(CONCURRENCY_MAX),
2529
timeout: z.number().int().min(TIMEOUT_MIN).max(TIMEOUT_MAX),
30+
iterations: z.number().int().min(ITERATIONS_MIN).max(ITERATIONS_MAX),
2631
jobToken: z.string().optional(),
2732
})
2833
.refine((data) => data.suite === "full" || (data.exercises || []).length > 0, {
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Development overrides - automatically loaded by docker compose
2+
# These settings only apply when running locally for development
3+
#
4+
# For production, use: docker compose -f docker-compose.yml up
5+
# (passing -f explicitly prevents this override file from being loaded)
6+
7+
services:
8+
web:
9+
environment:
10+
- NODE_ENV=development
11+
volumes:
12+
# Mount log files so web can access task logs
13+
- /tmp/evals:/tmp/evals:ro
14+
# Mount source code for hot reload in development
15+
- ../../apps/web-evals:/roo/repo/apps/web-evals:delegated
16+
- ../../packages/evals:/roo/repo/packages/evals:delegated
17+
- ../../packages/types:/roo/repo/packages/types:delegated
18+
- ../../packages/ipc:/roo/repo/packages/ipc:delegated
19+
- ../../packages/cloud:/roo/repo/packages/cloud:delegated
20+
# Exclude node_modules from being overwritten
21+
- /roo/repo/node_modules
22+
- /roo/repo/apps/web-evals/node_modules
23+
- /roo/repo/packages/evals/node_modules
24+
- /roo/repo/packages/types/node_modules
25+
- /roo/repo/packages/ipc/node_modules
26+
- /roo/repo/packages/cloud/node_modules
27+
entrypoint: []
28+
command:
29+
- sh
30+
- -c
31+
- |
32+
echo '🚀 Starting evals web service in development mode...'
33+
wait_for_db() {
34+
echo '⏳ Waiting for database...'
35+
until pg_isready -h db -p 5432 -U postgres -d evals_development > /dev/null 2>&1; do
36+
echo '⏳ Database not ready yet, waiting 2 seconds...'
37+
sleep 2
38+
done
39+
echo '✅ Database is ready'
40+
}
41+
wait_for_db
42+
echo '🔄 Running database migrations...'
43+
pnpm --filter @roo-code/evals db:migrate
44+
echo '🌐 Starting Next.js dev server...'
45+
cd /roo/repo/apps/web-evals && npx next dev -p 3446
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
DROP INDEX "tasks_language_exercise_idx";--> statement-breakpoint
2+
ALTER TABLE "tasks" ADD COLUMN "iteration" integer DEFAULT 1 NOT NULL;--> statement-breakpoint
3+
CREATE UNIQUE INDEX "tasks_language_exercise_iteration_idx" ON "tasks" USING btree ("run_id","language","exercise","iteration");

0 commit comments

Comments
 (0)