Skip to content

Commit ca77062

Browse files
StingNingningding97Takhoffman
authored
Cron: fix 1/3 timeout on fresh isolated CLI runs (#30140) thanks @ningding97
Verified: - pnpm build - pnpm check - pnpm test:macmini Co-authored-by: ningding97 <[email protected]> Co-authored-by: Tak Hoffman <[email protected]>
1 parent 949200d commit ca77062

File tree

4 files changed

+170
-3
lines changed

4 files changed

+170
-3
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ Docs: https://docs.openclaw.ai
106106
- Slack/Subagent completion delivery: stop forcing bound conversation IDs into `threadId` so Slack completion announces do not send invalid `thread_ts` for DMs/top-level channels. Landed from contributor PR #31105 by @stakeswky. Thanks @stakeswky.
107107
- Signal/Loop protection: evaluate own-account detection before sync-message filtering (including UUID-only `accountUuid` configs) so `sentTranscript` sync events cannot bypass loop protection and self-reply loops. Landed from contributor PR #31093 by @kevinWangSheng. Thanks @kevinWangSheng.
108108
- Gateway/Control UI origins: support wildcard `"*"` in `gateway.controlUi.allowedOrigins` for trusted remote access setups. Landed from contributor PR #31088 by @frankekn. Thanks @frankekn.
109+
- Cron/Isolated CLI timeout ratio: avoid reusing persisted CLI session IDs on fresh isolated cron runs so the fresh watchdog profile is used and jobs do not abort at roughly one-third of configured `timeoutSeconds`. (#30140) Thanks @ningding97.
109110
- Security/Sandbox media reads: eliminate sandbox media TOCTOU symlink-retarget escapes by enforcing root-scoped boundary-safe reads at attachment/image load time and consolidating shared safe-read helpers across sandbox media callsites. This ships in the next npm release. Thanks @tdjackey for reporting.
110111
- Security/Subagents sandbox inheritance: block sandboxed sessions from spawning cross-agent subagents that would run unsandboxed, preventing runtime sandbox downgrade via `sessions_spawn agentId`. Thanks @tdjackey for reporting.
111112
- Security/Workspace safe writes: harden `writeFileWithinRoot` against symlink-retarget TOCTOU races by opening existing files without truncation, creating missing files with exclusive create, deferring truncation until post-open identity+boundary validation, and removing out-of-root create artifacts on blocked races; added regression tests for truncate/create race paths. This ships in the next npm release (`2026.3.1`). Thanks @tdjackey for reporting.

src/cron/isolated-agent/run.skill-filter.test.ts

Lines changed: 82 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,12 +95,14 @@ vi.mock("../../agents/subagent-announce.js", () => ({
9595
runSubagentAnnounceFlow: vi.fn().mockResolvedValue(true),
9696
}));
9797

98+
const runCliAgentMock = vi.fn();
9899
vi.mock("../../agents/cli-runner.js", () => ({
99-
runCliAgent: vi.fn(),
100+
runCliAgent: runCliAgentMock,
100101
}));
101102

103+
const getCliSessionIdMock = vi.fn().mockReturnValue(undefined);
102104
vi.mock("../../agents/cli-session.js", () => ({
103-
getCliSessionId: vi.fn().mockReturnValue(undefined),
105+
getCliSessionId: getCliSessionIdMock,
104106
setCliSessionId: vi.fn(),
105107
}));
106108

@@ -494,4 +496,82 @@ describe("runCronIsolatedAgentTurn — skill filter", () => {
494496
expect(runWithModelFallbackMock).not.toHaveBeenCalled();
495497
});
496498
});
499+
500+
describe("CLI session handoff (issue #29774)", () => {
501+
it("does not pass stored cliSessionId on fresh isolated runs (isNewSession=true)", async () => {
502+
// Simulate a persisted CLI session ID from a previous run.
503+
getCliSessionIdMock.mockReturnValue("prev-cli-session-abc");
504+
isCliProviderMock.mockReturnValue(true);
505+
runCliAgentMock.mockResolvedValue({
506+
payloads: [{ text: "output" }],
507+
meta: { agentMeta: { sessionId: "new-cli-session-xyz", usage: { input: 5, output: 10 } } },
508+
});
509+
// Make runWithModelFallback invoke the run callback so the CLI path executes.
510+
runWithModelFallbackMock.mockImplementationOnce(
511+
async (params: { run: (provider: string, model: string) => Promise<unknown> }) => {
512+
const result = await params.run("claude-cli", "claude-opus-4-6");
513+
return { result, provider: "claude-cli", model: "claude-opus-4-6", attempts: [] };
514+
},
515+
);
516+
resolveCronSessionMock.mockReturnValue({
517+
storePath: "/tmp/store.json",
518+
store: {},
519+
sessionEntry: {
520+
sessionId: "test-session-fresh",
521+
updatedAt: 0,
522+
systemSent: false,
523+
skillsSnapshot: undefined,
524+
// A stored CLI session ID that should NOT be reused on fresh runs.
525+
cliSessionIds: { "claude-cli": "prev-cli-session-abc" },
526+
},
527+
systemSent: false,
528+
isNewSession: true,
529+
});
530+
531+
await runCronIsolatedAgentTurn(makeParams());
532+
533+
expect(runCliAgentMock).toHaveBeenCalledOnce();
534+
// Fresh session: cliSessionId must be undefined, not the stored value.
535+
expect(runCliAgentMock.mock.calls[0][0]).toHaveProperty("cliSessionId", undefined);
536+
});
537+
538+
it("reuses stored cliSessionId on continuation runs (isNewSession=false)", async () => {
539+
getCliSessionIdMock.mockReturnValue("existing-cli-session-def");
540+
isCliProviderMock.mockReturnValue(true);
541+
runCliAgentMock.mockResolvedValue({
542+
payloads: [{ text: "output" }],
543+
meta: {
544+
agentMeta: { sessionId: "existing-cli-session-def", usage: { input: 5, output: 10 } },
545+
},
546+
});
547+
runWithModelFallbackMock.mockImplementationOnce(
548+
async (params: { run: (provider: string, model: string) => Promise<unknown> }) => {
549+
const result = await params.run("claude-cli", "claude-opus-4-6");
550+
return { result, provider: "claude-cli", model: "claude-opus-4-6", attempts: [] };
551+
},
552+
);
553+
resolveCronSessionMock.mockReturnValue({
554+
storePath: "/tmp/store.json",
555+
store: {},
556+
sessionEntry: {
557+
sessionId: "test-session-continuation",
558+
updatedAt: 0,
559+
systemSent: false,
560+
skillsSnapshot: undefined,
561+
cliSessionIds: { "claude-cli": "existing-cli-session-def" },
562+
},
563+
systemSent: false,
564+
isNewSession: false,
565+
});
566+
567+
await runCronIsolatedAgentTurn(makeParams());
568+
569+
expect(runCliAgentMock).toHaveBeenCalledOnce();
570+
// Continuation: cliSessionId should be passed through for session resume.
571+
expect(runCliAgentMock.mock.calls[0][0]).toHaveProperty(
572+
"cliSessionId",
573+
"existing-cli-session-def",
574+
);
575+
});
576+
});
497577
});

src/cron/isolated-agent/run.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -463,7 +463,14 @@ export async function runCronIsolatedAgentTurn(params: {
463463
throw new Error(abortReason());
464464
}
465465
if (isCliProvider(providerOverride, cfgWithAgentDefaults)) {
466-
const cliSessionId = getCliSessionId(cronSession.sessionEntry, providerOverride);
466+
// Fresh isolated cron sessions must not reuse a stored CLI session ID.
467+
// Passing an existing ID activates the resume watchdog profile
468+
// (noOutputTimeoutRatio 0.3, maxMs 180 s) instead of the fresh profile
469+
// (ratio 0.8, maxMs 600 s), causing jobs to time out at roughly 1/3 of
470+
// the configured timeoutSeconds. See: https://github.com/openclaw/openclaw/issues/29774
471+
const cliSessionId = cronSession.isNewSession
472+
? undefined
473+
: getCliSessionId(cronSession.sessionEntry, providerOverride);
467474
return runCliAgent({
468475
sessionId: cronSession.sessionEntry.sessionId,
469476
sessionKey: agentSessionKey,

src/cron/service.issue-regressions.test.ts

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1513,4 +1513,83 @@ describe("Cron issue regressions", () => {
15131513
expect(jobs.find((job) => job.id === first.id)?.state.lastStatus).toBe("ok");
15141514
expect(jobs.find((job) => job.id === second.id)?.state.lastStatus).toBe("ok");
15151515
});
1516+
1517+
// Regression: isolated cron runs must not abort at 1/3 of configured timeoutSeconds.
1518+
// The bug (issue #29774) caused the CLI-provider resume watchdog (ratio 0.3, maxMs 180 s)
1519+
// to be applied on fresh sessions because a persisted cliSessionId was passed to
1520+
// runCliAgent even when isNewSession=true. At the service level this manifests as a
1521+
// job abort that fires much sooner than the configured outer timeout.
1522+
it("outer cron timeout fires at configured timeoutSeconds, not at 1/3 (#29774)", async () => {
1523+
vi.useRealTimers();
1524+
const store = await makeStorePath();
1525+
const scheduledAt = Date.parse("2026-02-15T13:00:00.000Z");
1526+
1527+
// Use a short but observable timeout: 300 ms.
1528+
// Before the fix, premature timeout would fire at ~100 ms (1/3 of 300 ms).
1529+
const timeoutSeconds = 0.3;
1530+
const cronJob = createIsolatedRegressionJob({
1531+
id: "timeout-fraction-29774",
1532+
name: "timeout fraction regression",
1533+
scheduledAt,
1534+
schedule: { kind: "at", at: new Date(scheduledAt).toISOString() },
1535+
payload: { kind: "agentTurn", message: "work", timeoutSeconds },
1536+
state: { nextRunAtMs: scheduledAt },
1537+
});
1538+
await writeCronJobs(store.storePath, [cronJob]);
1539+
1540+
const tempFile = path.join(os.tmpdir(), `cron-29774-${Date.now()}.txt`);
1541+
let now = scheduledAt;
1542+
const wallStart = Date.now();
1543+
let abortWallMs: number | undefined;
1544+
1545+
const state = createCronServiceState({
1546+
cronEnabled: true,
1547+
storePath: store.storePath,
1548+
log: noopLogger,
1549+
nowMs: () => now,
1550+
enqueueSystemEvent: vi.fn(),
1551+
requestHeartbeatNow: vi.fn(),
1552+
runIsolatedAgentJob: vi.fn(async ({ abortSignal }: { abortSignal?: AbortSignal }) => {
1553+
// Real side effect: confirm the job actually started.
1554+
await fs.writeFile(tempFile, "started", "utf-8");
1555+
await new Promise<void>((resolve) => {
1556+
if (!abortSignal) {
1557+
resolve();
1558+
return;
1559+
}
1560+
if (abortSignal.aborted) {
1561+
abortWallMs = Date.now();
1562+
resolve();
1563+
return;
1564+
}
1565+
abortSignal.addEventListener(
1566+
"abort",
1567+
() => {
1568+
abortWallMs = Date.now();
1569+
resolve();
1570+
},
1571+
{ once: true },
1572+
);
1573+
});
1574+
now += 5;
1575+
return { status: "ok" as const, summary: "done" };
1576+
}),
1577+
});
1578+
1579+
await onTimer(state);
1580+
1581+
// Confirm job started (real side effect).
1582+
await expect(fs.readFile(tempFile, "utf-8")).resolves.toBe("started");
1583+
await fs.unlink(tempFile).catch(() => {});
1584+
1585+
// The outer cron timeout fires at timeoutSeconds * 1000 = 300 ms.
1586+
// The abort must not have fired at ~100 ms (the 1/3 regression value).
1587+
// Allow generous lower bound (80%) to keep the test stable on loaded CI runners.
1588+
const elapsedMs = (abortWallMs ?? Date.now()) - wallStart;
1589+
expect(elapsedMs).toBeGreaterThanOrEqual(timeoutSeconds * 1000 * 0.8);
1590+
1591+
const job = state.store?.jobs.find((entry) => entry.id === "timeout-fraction-29774");
1592+
expect(job?.state.lastStatus).toBe("error");
1593+
expect(job?.state.lastError).toContain("timed out");
1594+
});
15161595
});

0 commit comments

Comments
 (0)