Skip to content

Commit 4637b90

Browse files
0xbrak and Takhoffman authored
feat(cron): configurable failure alerts for repeated job errors (#24789) thanks @0xbrak
Verified:
- pnpm install --frozen-lockfile
- pnpm check
- pnpm test -- --run src/cron/service.failure-alert.test.ts src/cli/cron-cli.test.ts src/gateway/protocol/cron-validators.test.ts

Co-authored-by: 0xbrak <[email protected]>
Co-authored-by: Tak Hoffman <[email protected]>
1 parent f902697 commit 4637b90

File tree

18 files changed

+842
-1
lines changed

18 files changed

+842
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ Docs: https://docs.openclaw.ai
8080

8181
### Fixes
8282

83+
- Cron/Failure alerts: add configurable repeated-failure alerting with per-job overrides and Web UI cron editor support (`inherit|disabled|custom` with threshold/cooldown/channel/target fields). (#24789) Thanks @0xbrak.
8384
- Cron/Isolated model defaults: resolve isolated cron `subagents.model` (including object-form `primary`) through allowlist-aware model selection so isolated cron runs honor subagent model defaults unless explicitly overridden by job payload model. (#11474) Thanks @AnonO6.
8485
- Cron/Isolated sessions list: persist the intended pre-run model/provider on isolated cron session entries so `sessions_list` reflects payload/session model overrides even when runs fail before post-run telemetry persistence. (#21279) Thanks @altaywtf.
8586
- Cron/One-shot reliability: retry transient one-shot failures with bounded backoff and configurable retry policy before disabling. (#24435) Thanks .

src/cli/cron-cli.test.ts

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -551,4 +551,53 @@ describe("cron cli", () => {
551551
it("rejects --exact on edit when existing job is not cron", async () => {
552552
await expectCronEditWithScheduleLookupExit({ kind: "every", everyMs: 60_000 }, ["--exact"]);
553553
});
554+
555+
it("patches failure alert settings on cron edit", async () => {
556+
callGatewayFromCli.mockClear();
557+
558+
const program = buildProgram();
559+
560+
await program.parseAsync(
561+
[
562+
"cron",
563+
"edit",
564+
"job-1",
565+
"--failure-alert-after",
566+
"3",
567+
"--failure-alert-cooldown",
568+
"1h",
569+
"--failure-alert-channel",
570+
"telegram",
571+
"--failure-alert-to",
572+
"19098680",
573+
],
574+
{ from: "user" },
575+
);
576+
577+
const updateCall = callGatewayFromCli.mock.calls.find((call) => call[0] === "cron.update");
578+
const patch = updateCall?.[2] as {
579+
patch?: {
580+
failureAlert?: { after?: number; cooldownMs?: number; channel?: string; to?: string };
581+
};
582+
};
583+
584+
expect(patch?.patch?.failureAlert?.after).toBe(3);
585+
expect(patch?.patch?.failureAlert?.cooldownMs).toBe(3_600_000);
586+
expect(patch?.patch?.failureAlert?.channel).toBe("telegram");
587+
expect(patch?.patch?.failureAlert?.to).toBe("19098680");
588+
});
589+
590+
it("supports --no-failure-alert on cron edit", async () => {
591+
callGatewayFromCli.mockClear();
592+
593+
const program = buildProgram();
594+
595+
await program.parseAsync(["cron", "edit", "job-1", "--no-failure-alert"], {
596+
from: "user",
597+
});
598+
599+
const updateCall = callGatewayFromCli.mock.calls.find((call) => call[0] === "cron.update");
600+
const patch = updateCall?.[2] as { patch?: { failureAlert?: boolean } };
601+
expect(patch?.patch?.failureAlert).toBe(false);
602+
});
554603
});

src/cli/cron-cli/register.cron-edit.ts

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,15 @@ export function registerCronEditCommand(cron: Command) {
6262
.option("--account <id>", "Channel account id for delivery (multi-account setups)")
6363
.option("--best-effort-deliver", "Do not fail job if delivery fails")
6464
.option("--no-best-effort-deliver", "Fail job when delivery fails")
65+
.option("--failure-alert", "Enable failure alerts for this job")
66+
.option("--no-failure-alert", "Disable failure alerts for this job")
67+
.option("--failure-alert-after <n>", "Alert after N consecutive job errors")
68+
.option(
69+
"--failure-alert-channel <channel>",
70+
`Failure alert channel (${getCronChannelOptions()})`,
71+
)
72+
.option("--failure-alert-to <dest>", "Failure alert destination")
73+
.option("--failure-alert-cooldown <duration>", "Minimum time between alerts (e.g. 1h, 30m)")
6574
.action(async (id, opts) => {
6675
try {
6776
if (opts.session === "main" && opts.message) {
@@ -264,6 +273,49 @@ export function registerCronEditCommand(cron: Command) {
264273
patch.delivery = delivery;
265274
}
266275

276+
const hasFailureAlertAfter = typeof opts.failureAlertAfter === "string";
277+
const hasFailureAlertChannel = typeof opts.failureAlertChannel === "string";
278+
const hasFailureAlertTo = typeof opts.failureAlertTo === "string";
279+
const hasFailureAlertCooldown = typeof opts.failureAlertCooldown === "string";
280+
const hasFailureAlertFields =
281+
hasFailureAlertAfter ||
282+
hasFailureAlertChannel ||
283+
hasFailureAlertTo ||
284+
hasFailureAlertCooldown;
285+
const failureAlertFlag =
286+
typeof opts.failureAlert === "boolean" ? opts.failureAlert : undefined;
287+
if (failureAlertFlag === false && hasFailureAlertFields) {
288+
throw new Error("Use --no-failure-alert alone (without failure-alert-* options).");
289+
}
290+
if (failureAlertFlag === false) {
291+
patch.failureAlert = false;
292+
} else if (failureAlertFlag === true || hasFailureAlertFields) {
293+
const failureAlert: Record<string, unknown> = {};
294+
if (hasFailureAlertAfter) {
295+
const after = Number.parseInt(String(opts.failureAlertAfter), 10);
296+
if (!Number.isFinite(after) || after <= 0) {
297+
throw new Error("Invalid --failure-alert-after (must be a positive integer).");
298+
}
299+
failureAlert.after = after;
300+
}
301+
if (hasFailureAlertChannel) {
302+
const channel = String(opts.failureAlertChannel).trim().toLowerCase();
303+
failureAlert.channel = channel ? channel : undefined;
304+
}
305+
if (hasFailureAlertTo) {
306+
const to = String(opts.failureAlertTo).trim();
307+
failureAlert.to = to ? to : undefined;
308+
}
309+
if (hasFailureAlertCooldown) {
310+
const cooldownMs = parseDurationMs(String(opts.failureAlertCooldown));
311+
if (!cooldownMs && cooldownMs !== 0) {
312+
throw new Error("Invalid --failure-alert-cooldown.");
313+
}
314+
failureAlert.cooldownMs = cooldownMs;
315+
}
316+
patch.failureAlert = failureAlert;
317+
}
318+
267319
const res = await callGatewayFromCli("cron.update", opts, {
268320
id,
269321
patch,

src/config/types.cron.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@ export type CronRetryConfig = {
1010
retryOn?: CronRetryOn[];
1111
};
1212

13+
export type CronFailureAlertConfig = {
14+
enabled?: boolean;
15+
after?: number;
16+
cooldownMs?: number;
17+
};
18+
1319
export type CronConfig = {
1420
enabled?: boolean;
1521
store?: string;
@@ -37,4 +43,5 @@ export type CronConfig = {
3743
maxBytes?: number | string;
3844
keepLines?: number;
3945
};
46+
failureAlert?: CronFailureAlertConfig;
4047
};

src/config/zod-schema.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,14 @@ export const OpenClawSchema = z
395395
})
396396
.strict()
397397
.optional(),
398+
failureAlert: z
399+
.object({
400+
enabled: z.boolean().optional(),
401+
after: z.number().int().min(1).optional(),
402+
cooldownMs: z.number().int().min(0).optional(),
403+
})
404+
.strict()
405+
.optional(),
398406
})
399407
.strict()
400408
.superRefine((val, ctx) => {
src/cron/service.failure-alert.test.ts

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
import fs from "node:fs/promises";
2+
import os from "node:os";
3+
import path from "node:path";
4+
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
5+
import { CronService } from "./service.js";
6+
7+
const noopLogger = {
8+
debug: vi.fn(),
9+
info: vi.fn(),
10+
warn: vi.fn(),
11+
error: vi.fn(),
12+
};
13+
14+
async function makeStorePath() {
15+
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-cron-failure-alert-"));
16+
return {
17+
storePath: path.join(dir, "cron", "jobs.json"),
18+
cleanup: async () => {
19+
await fs.rm(dir, { recursive: true, force: true });
20+
},
21+
};
22+
}
23+
24+
describe("CronService failure alerts", () => {
25+
beforeEach(() => {
26+
vi.useFakeTimers();
27+
vi.setSystemTime(new Date("2026-01-01T00:00:00.000Z"));
28+
noopLogger.debug.mockClear();
29+
noopLogger.info.mockClear();
30+
noopLogger.warn.mockClear();
31+
noopLogger.error.mockClear();
32+
});
33+
34+
afterEach(() => {
35+
vi.useRealTimers();
36+
});
37+
38+
it("alerts after configured consecutive failures and honors cooldown", async () => {
39+
const store = await makeStorePath();
40+
const sendCronFailureAlert = vi.fn(async () => undefined);
41+
const runIsolatedAgentJob = vi.fn(async () => ({
42+
status: "error" as const,
43+
error: "wrong model id",
44+
}));
45+
46+
const cron = new CronService({
47+
storePath: store.storePath,
48+
cronEnabled: true,
49+
cronConfig: {
50+
failureAlert: {
51+
enabled: true,
52+
after: 2,
53+
cooldownMs: 60_000,
54+
},
55+
},
56+
log: noopLogger,
57+
enqueueSystemEvent: vi.fn(),
58+
requestHeartbeatNow: vi.fn(),
59+
runIsolatedAgentJob,
60+
sendCronFailureAlert,
61+
});
62+
63+
await cron.start();
64+
const job = await cron.add({
65+
name: "daily report",
66+
enabled: true,
67+
schedule: { kind: "every", everyMs: 60_000 },
68+
sessionTarget: "isolated",
69+
wakeMode: "next-heartbeat",
70+
payload: { kind: "agentTurn", message: "run report" },
71+
delivery: { mode: "announce", channel: "telegram", to: "19098680" },
72+
});
73+
74+
await cron.run(job.id, "force");
75+
expect(sendCronFailureAlert).not.toHaveBeenCalled();
76+
77+
await cron.run(job.id, "force");
78+
expect(sendCronFailureAlert).toHaveBeenCalledTimes(1);
79+
expect(sendCronFailureAlert).toHaveBeenLastCalledWith(
80+
expect.objectContaining({
81+
job: expect.objectContaining({ id: job.id }),
82+
channel: "telegram",
83+
to: "19098680",
84+
text: expect.stringContaining('Cron job "daily report" failed 2 times'),
85+
}),
86+
);
87+
88+
await cron.run(job.id, "force");
89+
expect(sendCronFailureAlert).toHaveBeenCalledTimes(1);
90+
91+
vi.advanceTimersByTime(60_000);
92+
await cron.run(job.id, "force");
93+
expect(sendCronFailureAlert).toHaveBeenCalledTimes(2);
94+
expect(sendCronFailureAlert).toHaveBeenLastCalledWith(
95+
expect.objectContaining({
96+
text: expect.stringContaining('Cron job "daily report" failed 4 times'),
97+
}),
98+
);
99+
100+
cron.stop();
101+
await store.cleanup();
102+
});
103+
104+
it("supports per-job failure alert override when global alerts are disabled", async () => {
105+
const store = await makeStorePath();
106+
const sendCronFailureAlert = vi.fn(async () => undefined);
107+
const runIsolatedAgentJob = vi.fn(async () => ({
108+
status: "error" as const,
109+
error: "timeout",
110+
}));
111+
112+
const cron = new CronService({
113+
storePath: store.storePath,
114+
cronEnabled: true,
115+
cronConfig: {
116+
failureAlert: {
117+
enabled: false,
118+
},
119+
},
120+
log: noopLogger,
121+
enqueueSystemEvent: vi.fn(),
122+
requestHeartbeatNow: vi.fn(),
123+
runIsolatedAgentJob,
124+
sendCronFailureAlert,
125+
});
126+
127+
await cron.start();
128+
const job = await cron.add({
129+
name: "job with override",
130+
enabled: true,
131+
schedule: { kind: "every", everyMs: 60_000 },
132+
sessionTarget: "isolated",
133+
wakeMode: "next-heartbeat",
134+
payload: { kind: "agentTurn", message: "run report" },
135+
failureAlert: {
136+
after: 1,
137+
channel: "telegram",
138+
to: "12345",
139+
cooldownMs: 1,
140+
},
141+
});
142+
143+
await cron.run(job.id, "force");
144+
expect(sendCronFailureAlert).toHaveBeenCalledTimes(1);
145+
expect(sendCronFailureAlert).toHaveBeenLastCalledWith(
146+
expect.objectContaining({
147+
channel: "telegram",
148+
to: "12345",
149+
}),
150+
);
151+
152+
cron.stop();
153+
await store.cleanup();
154+
});
155+
156+
it("respects per-job failureAlert=false and suppresses alerts", async () => {
157+
const store = await makeStorePath();
158+
const sendCronFailureAlert = vi.fn(async () => undefined);
159+
const runIsolatedAgentJob = vi.fn(async () => ({
160+
status: "error" as const,
161+
error: "auth error",
162+
}));
163+
164+
const cron = new CronService({
165+
storePath: store.storePath,
166+
cronEnabled: true,
167+
cronConfig: {
168+
failureAlert: {
169+
enabled: true,
170+
after: 1,
171+
},
172+
},
173+
log: noopLogger,
174+
enqueueSystemEvent: vi.fn(),
175+
requestHeartbeatNow: vi.fn(),
176+
runIsolatedAgentJob,
177+
sendCronFailureAlert,
178+
});
179+
180+
await cron.start();
181+
const job = await cron.add({
182+
name: "disabled alert job",
183+
enabled: true,
184+
schedule: { kind: "every", everyMs: 60_000 },
185+
sessionTarget: "isolated",
186+
wakeMode: "next-heartbeat",
187+
payload: { kind: "agentTurn", message: "run report" },
188+
failureAlert: false,
189+
});
190+
191+
await cron.run(job.id, "force");
192+
await cron.run(job.id, "force");
193+
expect(sendCronFailureAlert).not.toHaveBeenCalled();
194+
195+
cron.stop();
196+
await store.cleanup();
197+
});
198+
});

0 commit comments

Comments
 (0)