Skip to content

Commit 5f90883

Browse files
zerone0x and altaywtf authored
fix(auth): reset cooldown error counters on expiry to prevent infinite escalation (#41028)
Merged via squash.
Prepared head SHA: 89bd83f
Co-authored-by: zerone0x <[email protected]>
Co-authored-by: altaywtf <[email protected]>
Reviewed-by: @altaywtf
1 parent 2b2e5e2 commit 5f90883

File tree

4 files changed

+77
-5
lines changed

4 files changed

+77
-5
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Docs: https://docs.openclaw.ai
2323
- Agents/embedded logs: add structured, sanitized lifecycle and failover observation events so overload and provider failures are easier to tail and filter. (#41336) thanks @altaywtf.
2424
- iOS/gateway foreground recovery: reconnect immediately on foreground return after stale background sockets are torn down, so the app no longer stays disconnected until a later wake path happens. (#41384) Thanks @mbelinky.
2525
- Cron/subagent followup: do not misclassify empty or `NO_REPLY` cron responses as interim acknowledgements that need a rerun, so deliberately silent cron jobs are no longer retried. (#41383) thanks @jackal092927.
26+
- Auth/cooldowns: reset expired auth-profile cooldown error counters before computing the next backoff so stale on-disk counters do not re-escalate into long cooldown loops after expiry. (#41028) thanks @zerone0x.
2627

2728
## 2026.3.8
2829

src/agents/auth-profiles.markauthprofilefailure.test.ts

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,58 @@ describe("markAuthProfileFailure", () => {
190190
}
191191
});
192192

193+
it("resets error count when previous cooldown has expired to prevent escalation", async () => {
194+
const agentDir = fs.mkdtempSync(path.join(os.tmpdir(), "openclaw-auth-"));
195+
try {
196+
const authPath = path.join(agentDir, "auth-profiles.json");
197+
const now = Date.now();
198+
// Simulate state left on disk after 3 rapid failures within a 1-min cooldown
199+
// window. The cooldown has since expired, but clearExpiredCooldowns() only
200+
// ran in-memory and never persisted — so disk still carries errorCount: 3.
201+
fs.writeFileSync(
202+
authPath,
203+
JSON.stringify({
204+
version: 1,
205+
profiles: {
206+
"anthropic:default": {
207+
type: "api_key",
208+
provider: "anthropic",
209+
key: "sk-default",
210+
},
211+
},
212+
usageStats: {
213+
"anthropic:default": {
214+
errorCount: 3,
215+
failureCounts: { rate_limit: 3 },
216+
lastFailureAt: now - 120_000, // 2 minutes ago
217+
cooldownUntil: now - 60_000, // expired 1 minute ago
218+
},
219+
},
220+
}),
221+
);
222+
223+
const store = ensureAuthProfileStore(agentDir);
224+
await markAuthProfileFailure({
225+
store,
226+
profileId: "anthropic:default",
227+
reason: "rate_limit",
228+
agentDir,
229+
});
230+
231+
const stats = store.usageStats?.["anthropic:default"];
232+
// Error count should reset to 1 (not escalate to 4) because the
233+
// previous cooldown expired. Cooldown should be ~1 min, not ~60 min.
234+
expect(stats?.errorCount).toBe(1);
235+
expect(stats?.failureCounts?.rate_limit).toBe(1);
236+
const cooldownMs = (stats?.cooldownUntil ?? 0) - now;
237+
// calculateAuthProfileCooldownMs(1) = 60_000 (1 minute)
238+
expect(cooldownMs).toBeLessThan(120_000);
239+
expect(cooldownMs).toBeGreaterThan(0);
240+
} finally {
241+
fs.rmSync(agentDir, { recursive: true, force: true });
242+
}
243+
});
244+
193245
it("does not persist cooldown windows for OpenRouter profiles", async () => {
194246
await withAuthProfileStore(async ({ agentDir, store }) => {
195247
await markAuthProfileFailure({

src/agents/auth-profiles/usage.test.ts

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,10 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
608608
});
609609
}
610610

611+
// When a cooldown/disabled window expires, the error count resets to prevent
612+
// stale counters from escalating the next cooldown (the root cause of
613+
// infinite cooldown loops — see #40989). The next failure should compute
614+
// backoff from errorCount=1, not from the accumulated stale count.
611615
const expiredWindowCases = [
612616
{
613617
label: "cooldownUntil",
@@ -617,7 +621,8 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
617621
errorCount: 3,
618622
lastFailureAt: now - 60_000,
619623
}),
620-
expectedUntil: (now: number) => now + 60 * 60 * 1000,
624+
// errorCount resets → calculateAuthProfileCooldownMs(1) = 60_000
625+
expectedUntil: (now: number) => now + 60_000,
621626
readUntil: (stats: WindowStats | undefined) => stats?.cooldownUntil,
622627
},
623628
{
@@ -630,7 +635,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
630635
failureCounts: { billing: 2 },
631636
lastFailureAt: now - 60_000,
632637
}),
633-
expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000,
638+
// errorCount resets, billing count resets to 1 →
639+
// calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
640+
expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
634641
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
635642
},
636643
{
@@ -643,7 +650,9 @@ describe("markAuthProfileFailure — active windows do not extend on retry", ()
643650
failureCounts: { auth_permanent: 2 },
644651
lastFailureAt: now - 60_000,
645652
}),
646-
expectedUntil: (now: number) => now + 20 * 60 * 60 * 1000,
653+
// errorCount resets, auth_permanent count resets to 1 →
654+
// calculateAuthProfileBillingDisableMsWithConfig(1, 5h, 24h) = 5h
655+
expectedUntil: (now: number) => now + 5 * 60 * 60 * 1000,
647656
readUntil: (stats: WindowStats | undefined) => stats?.disabledUntil,
648657
},
649658
];

src/agents/auth-profiles/usage.ts

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,9 +400,19 @@ function computeNextProfileUsageStats(params: {
400400
params.existing.lastFailureAt > 0 &&
401401
params.now - params.existing.lastFailureAt > windowMs;
402402

403-
const baseErrorCount = windowExpired ? 0 : (params.existing.errorCount ?? 0);
403+
// If the previous cooldown has already expired, reset error counters so the
404+
// profile gets a fresh backoff window. clearExpiredCooldowns() does this
405+
// in-memory during profile ordering, but the on-disk state may still carry
406+
// the old counters when the lock-based updater reads a fresh store. Without
407+
// this check, stale error counts from an expired cooldown cause the next
408+
// failure to escalate to a much longer cooldown (e.g. 1 min → 25 min).
409+
const unusableUntil = resolveProfileUnusableUntil(params.existing);
410+
const previousCooldownExpired = typeof unusableUntil === "number" && params.now >= unusableUntil;
411+
412+
const shouldResetCounters = windowExpired || previousCooldownExpired;
413+
const baseErrorCount = shouldResetCounters ? 0 : (params.existing.errorCount ?? 0);
404414
const nextErrorCount = baseErrorCount + 1;
405-
const failureCounts = windowExpired ? {} : { ...params.existing.failureCounts };
415+
const failureCounts = shouldResetCounters ? {} : { ...params.existing.failureCounts };
406416
failureCounts[params.reason] = (failureCounts[params.reason] ?? 0) + 1;
407417

408418
const updatedStats: ProfileUsageStats = {

0 commit comments

Comments
 (0)