Skip to content

Commit aaa7de4

Browse files
FlamesCNTakhoffman
andauthored
fix(cron): prevent armTimer tight loop when job has stuck runningAtMs (#29853) thanks @FlamesCN
Verified: - pnpm build - pnpm check - pnpm test:macmini Co-authored-by: FlamesCN <[email protected]> Co-authored-by: Tak Hoffman <[email protected]>
1 parent ffe1937 commit aaa7de4

File tree

3 files changed

+195
-1
lines changed

3 files changed

+195
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ Docs: https://docs.openclaw.ai
107107
- Telegram/Empty final replies: skip outbound send for null/undefined final text payloads without media so Telegram typing indicators do not linger on `text must be non-empty` errors. Landed from contributor PR #30969 by @haosenwang1018. Thanks @haosenwang1018.
108108
- Routing/Binding peer-kind parity: treat `peer.kind` `group` and `channel` as equivalent for binding scope matching (while keeping `direct` separate) so Slack/public channel bindings do not silently fall through. Landed from contributor PR #31135 by @Sid-Qin. Thanks @Sid-Qin.
109109
- Agents/FS workspace default: honor documented host file-tool default `tools.fs.workspaceOnly=false` when unset so host `write`/`edit` calls are not incorrectly workspace-restricted unless explicitly enabled. Landed from contributor PR #31128 by @SaucePackets. Thanks @SaucePackets.
110+
- Cron/Timer hot-loop guard: enforce a minimum timer re-arm delay when stale past-due jobs would otherwise trigger repeated `setTimeout(0)` loops, preventing event-loop saturation and log-flood behavior. (#29853) Thanks @FlamesCN.
110111
- Gateway/CLI session recovery: handle expired CLI session IDs gracefully by clearing stale session state and retrying without crashing gateway runs. Landed from contributor PR #31090 by @frankekn. Thanks @frankekn.
111112
- Slack/Subagent completion delivery: stop forcing bound conversation IDs into `threadId` so Slack completion announces do not send invalid `thread_ts` for DMs/top-level channels. Landed from contributor PR #31105 by @stakeswky. Thanks @stakeswky.
112113
- Signal/Loop protection: evaluate own-account detection before sync-message filtering (including UUID-only `accountUuid` configs) so `sentTranscript` sync events cannot bypass loop protection and self-reply loops. Landed from contributor PR #31093 by @kevinWangSheng. Thanks @kevinWangSheng.
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
import fs from "node:fs/promises";
2+
import path from "node:path";
3+
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
4+
import { createNoopLogger, createCronStoreHarness } from "./service.test-harness.js";
5+
import { createCronServiceState } from "./service/state.js";
6+
import { armTimer, onTimer } from "./service/timer.js";
7+
import type { CronJob } from "./types.js";
8+
9+
const noopLogger = createNoopLogger();
10+
const { makeStorePath } = createCronStoreHarness({ prefix: "openclaw-cron-tight-loop-" });
11+
12+
/**
13+
* Create a cron job that is past-due AND has a stuck `runningAtMs` marker.
14+
* This combination causes `findDueJobs` to return `[]` (blocked by
15+
* `runningAtMs`) while `nextWakeAtMs` still returns the past-due timestamp,
16+
* which before the fix resulted in a `setTimeout(0)` tight loop.
17+
*/
18+
function createStuckPastDueJob(params: { id: string; nowMs: number; pastDueMs: number }): CronJob {
19+
const pastDueAt = params.nowMs - params.pastDueMs;
20+
return {
21+
id: params.id,
22+
name: "stuck-job",
23+
enabled: true,
24+
deleteAfterRun: false,
25+
createdAtMs: pastDueAt - 60_000,
26+
updatedAtMs: pastDueAt - 60_000,
27+
schedule: { kind: "cron", expr: "*/15 * * * *" },
28+
sessionTarget: "isolated",
29+
wakeMode: "next-heartbeat",
30+
payload: { kind: "agentTurn", message: "monitor" },
31+
delivery: { mode: "none" },
32+
state: {
33+
nextRunAtMs: pastDueAt,
34+
// Stuck: set from a previous execution that was interrupted.
35+
// Not yet old enough for STUCK_RUN_MS (2 h) to clear it.
36+
runningAtMs: pastDueAt + 1,
37+
},
38+
};
39+
}
40+
41+
describe("CronService - armTimer tight loop prevention", () => {
42+
beforeEach(() => {
43+
noopLogger.debug.mockClear();
44+
noopLogger.info.mockClear();
45+
noopLogger.warn.mockClear();
46+
noopLogger.error.mockClear();
47+
});
48+
49+
afterEach(() => {
50+
vi.clearAllMocks();
51+
});
52+
53+
it("enforces a minimum delay when the next wake time is in the past", () => {
54+
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
55+
const now = Date.parse("2026-02-28T12:32:00.000Z");
56+
const pastDueMs = 17 * 60 * 1000; // 17 minutes past due
57+
58+
const state = createCronServiceState({
59+
storePath: "/tmp/test-cron/jobs.json",
60+
cronEnabled: true,
61+
log: noopLogger,
62+
nowMs: () => now,
63+
enqueueSystemEvent: vi.fn(),
64+
requestHeartbeatNow: vi.fn(),
65+
runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok" }),
66+
});
67+
state.store = {
68+
version: 1,
69+
jobs: [createStuckPastDueJob({ id: "monitor", nowMs: now, pastDueMs })],
70+
};
71+
72+
armTimer(state);
73+
74+
expect(state.timer).not.toBeNull();
75+
const delays = timeoutSpy.mock.calls
76+
.map(([, delay]) => delay)
77+
.filter((d): d is number => typeof d === "number");
78+
79+
// Before the fix, delay would be 0 (tight loop).
80+
// After the fix, delay must be >= MIN_REFIRE_GAP_MS (2000 ms).
81+
expect(delays.length).toBeGreaterThan(0);
82+
for (const d of delays) {
83+
expect(d).toBeGreaterThanOrEqual(2_000);
84+
}
85+
86+
timeoutSpy.mockRestore();
87+
});
88+
89+
it("does not add extra delay when the next wake time is in the future", () => {
90+
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
91+
const now = Date.parse("2026-02-28T12:32:00.000Z");
92+
93+
const state = createCronServiceState({
94+
storePath: "/tmp/test-cron/jobs.json",
95+
cronEnabled: true,
96+
log: noopLogger,
97+
nowMs: () => now,
98+
enqueueSystemEvent: vi.fn(),
99+
requestHeartbeatNow: vi.fn(),
100+
runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok" }),
101+
});
102+
state.store = {
103+
version: 1,
104+
jobs: [
105+
{
106+
id: "future-job",
107+
name: "future-job",
108+
enabled: true,
109+
deleteAfterRun: false,
110+
createdAtMs: now,
111+
updatedAtMs: now,
112+
schedule: { kind: "cron", expr: "*/15 * * * *" },
113+
sessionTarget: "isolated" as const,
114+
wakeMode: "next-heartbeat" as const,
115+
payload: { kind: "agentTurn" as const, message: "test" },
116+
delivery: { mode: "none" as const },
117+
state: { nextRunAtMs: now + 10_000 }, // 10 seconds in the future
118+
},
119+
],
120+
};
121+
122+
armTimer(state);
123+
124+
const delays = timeoutSpy.mock.calls
125+
.map(([, delay]) => delay)
126+
.filter((d): d is number => typeof d === "number");
127+
128+
// The natural delay (10 s) should be used, not the floor.
129+
expect(delays).toContain(10_000);
130+
131+
timeoutSpy.mockRestore();
132+
});
133+
134+
it("breaks the onTimer→armTimer hot-loop with stuck runningAtMs", async () => {
135+
const timeoutSpy = vi.spyOn(globalThis, "setTimeout");
136+
const store = await makeStorePath();
137+
const now = Date.parse("2026-02-28T12:32:00.000Z");
138+
const pastDueMs = 17 * 60 * 1000;
139+
140+
await fs.mkdir(path.dirname(store.storePath), { recursive: true });
141+
await fs.writeFile(
142+
store.storePath,
143+
JSON.stringify(
144+
{
145+
version: 1,
146+
jobs: [createStuckPastDueJob({ id: "monitor", nowMs: now, pastDueMs })],
147+
},
148+
null,
149+
2,
150+
),
151+
"utf-8",
152+
);
153+
154+
const state = createCronServiceState({
155+
storePath: store.storePath,
156+
cronEnabled: true,
157+
log: noopLogger,
158+
nowMs: () => now,
159+
enqueueSystemEvent: vi.fn(),
160+
requestHeartbeatNow: vi.fn(),
161+
runIsolatedAgentJob: vi.fn().mockResolvedValue({ status: "ok" }),
162+
});
163+
164+
// Simulate the onTimer path: it will find no runnable jobs (blocked by
165+
// runningAtMs) and re-arm the timer in its finally block.
166+
await onTimer(state);
167+
168+
expect(state.running).toBe(false);
169+
expect(state.timer).not.toBeNull();
170+
171+
// The re-armed timer must NOT use delay=0. It should use at least
172+
// MIN_REFIRE_GAP_MS to prevent the hot-loop.
173+
const allDelays = timeoutSpy.mock.calls
174+
.map(([, delay]) => delay)
175+
.filter((d): d is number => typeof d === "number");
176+
177+
// The last setTimeout call is from the finally→armTimer path.
178+
const lastDelay = allDelays[allDelays.length - 1];
179+
expect(lastDelay).toBeGreaterThanOrEqual(2_000);
180+
181+
timeoutSpy.mockRestore();
182+
await store.cleanup();
183+
});
184+
});

src/cron/service/timer.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -446,9 +446,18 @@ export function armTimer(state: CronServiceState) {
446446
}
447447
const now = state.deps.nowMs();
448448
const delay = Math.max(nextAt - now, 0);
449+
// Floor: when the next wake time is in the past (delay === 0), enforce a
450+
// minimum delay to prevent a tight setTimeout(0) loop. This can happen
451+
// when a job has a stuck runningAtMs marker and a past-due nextRunAtMs:
452+
// findDueJobs skips the job (blocked by runningAtMs), while
453+
// recomputeNextRunsForMaintenance intentionally does not advance the
454+
// past-due nextRunAtMs (per #13992). The finally block in onTimer then
455+
// re-invokes armTimer with delay === 0, creating an infinite hot-loop
456+
// that saturates the event loop and fills the log file to its size cap.
457+
const flooredDelay = delay === 0 ? MIN_REFIRE_GAP_MS : delay;
449458
// Wake at least once a minute to avoid schedule drift and recover quickly
450459
// when the process was paused or wall-clock time jumps.
451-
const clampedDelay = Math.min(delay, MAX_TIMER_DELAY_MS);
460+
const clampedDelay = Math.min(flooredDelay, MAX_TIMER_DELAY_MS);
452461
// Intentionally avoid an `async` timer callback:
453462
// Vitest's fake-timer helpers can await async callbacks, which would block
454463
// tests that simulate long-running jobs. Runtime behavior is unchanged.

0 commit comments

Comments
 (0)