Skip to content

Commit 531e836

Browse files
authored
Agents: add fallback error observations (openclaw#41337)
Merged via squash. Prepared head SHA: 852469c Co-authored-by: altaywtf <[email protected]> Co-authored-by: altaywtf <[email protected]> Reviewed-by: @altaywtf
1 parent 3c34743 commit 531e836

17 files changed

+502
-36
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai
3434
- ACP/regressions: add gateway RPC coverage for ACP lineage patching, ACPX runtime coverage for image prompt serialization, and an operator smoke-test procedure for live ACP spawn verification. (#41456) Thanks @mbelinky.
3535
- Agents/billing recovery: probe single-provider billing cooldowns on the existing throttle so topping up credits can recover without a manual gateway restart. (#41422) thanks @altaywtf.
3636
- ACP/follow-up hardening: make session restore and prompt completion degrade gracefully on transcript/update failures, enforce bounded tool-location traversal, and skip non-image ACPX turns the runtime cannot serialize. (#41464) Thanks @mbelinky.
37+
- Agents/fallback observability: add structured, sanitized model-fallback decision and auth-profile failure-state events with correlated run IDs so cooldown probes and failover paths are easier to trace in logs. (#41337) thanks @altaywtf.
3738

3839
## 2026.3.8
3940

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import { afterEach, describe, expect, it, vi } from "vitest";
2+
import { resetLogger, setLoggerOverride } from "../../logging/logger.js";
3+
import { logAuthProfileFailureStateChange } from "./state-observation.js";
4+
5+
afterEach(() => {
6+
setLoggerOverride(null);
7+
resetLogger();
8+
});
9+
10+
describe("logAuthProfileFailureStateChange", () => {
11+
it("sanitizes consoleMessage fields before logging", () => {
12+
const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});
13+
setLoggerOverride({ level: "silent", consoleLevel: "warn" });
14+
15+
logAuthProfileFailureStateChange({
16+
runId: "run-1\nforged\tentry\rtest",
17+
profileId: "openai:profile-1",
18+
provider: "openai\u001b]8;;https://evil.test\u0007",
19+
reason: "overloaded",
20+
previous: undefined,
21+
next: {
22+
errorCount: 1,
23+
cooldownUntil: 1_700_000_060_000,
24+
failureCounts: { overloaded: 1 },
25+
},
26+
now: 1_700_000_000_000,
27+
});
28+
29+
const consoleLine = warnSpy.mock.calls[0]?.[0];
30+
expect(typeof consoleLine).toBe("string");
31+
expect(consoleLine).toContain("runId=run-1 forged entry test");
32+
expect(consoleLine).toContain("provider=openai]8;;https://evil.test");
33+
expect(consoleLine).not.toContain("\n");
34+
expect(consoleLine).not.toContain("\r");
35+
expect(consoleLine).not.toContain("\t");
36+
expect(consoleLine).not.toContain("\u001b");
37+
});
38+
});
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import { redactIdentifier } from "../../logging/redact-identifier.js";
2+
import { createSubsystemLogger } from "../../logging/subsystem.js";
3+
import { sanitizeForConsole } from "../pi-embedded-error-observation.js";
4+
import type { AuthProfileFailureReason, ProfileUsageStats } from "./types.js";
5+
6+
const observationLog = createSubsystemLogger("agent/embedded");
7+
8+
export function logAuthProfileFailureStateChange(params: {
9+
runId?: string;
10+
profileId: string;
11+
provider: string;
12+
reason: AuthProfileFailureReason;
13+
previous: ProfileUsageStats | undefined;
14+
next: ProfileUsageStats;
15+
now: number;
16+
}): void {
17+
const windowType =
18+
params.reason === "billing" || params.reason === "auth_permanent" ? "disabled" : "cooldown";
19+
const previousCooldownUntil = params.previous?.cooldownUntil;
20+
const previousDisabledUntil = params.previous?.disabledUntil;
21+
// Active cooldown/disable windows are intentionally immutable; log whether this
22+
// update reused the existing window instead of extending it.
23+
const windowReused =
24+
windowType === "disabled"
25+
? typeof previousDisabledUntil === "number" &&
26+
Number.isFinite(previousDisabledUntil) &&
27+
previousDisabledUntil > params.now &&
28+
previousDisabledUntil === params.next.disabledUntil
29+
: typeof previousCooldownUntil === "number" &&
30+
Number.isFinite(previousCooldownUntil) &&
31+
previousCooldownUntil > params.now &&
32+
previousCooldownUntil === params.next.cooldownUntil;
33+
const safeProfileId = redactIdentifier(params.profileId, { len: 12 });
34+
const safeRunId = sanitizeForConsole(params.runId) ?? "-";
35+
const safeProvider = sanitizeForConsole(params.provider) ?? "-";
36+
37+
observationLog.warn("auth profile failure state updated", {
38+
event: "auth_profile_failure_state_updated",
39+
tags: ["error_handling", "auth_profiles", windowType],
40+
runId: params.runId,
41+
profileId: safeProfileId,
42+
provider: params.provider,
43+
reason: params.reason,
44+
windowType,
45+
windowReused,
46+
previousErrorCount: params.previous?.errorCount,
47+
errorCount: params.next.errorCount,
48+
previousCooldownUntil,
49+
cooldownUntil: params.next.cooldownUntil,
50+
previousDisabledUntil,
51+
disabledUntil: params.next.disabledUntil,
52+
previousDisabledReason: params.previous?.disabledReason,
53+
disabledReason: params.next.disabledReason,
54+
failureCounts: params.next.failureCounts,
55+
consoleMessage:
56+
`auth profile failure state updated: runId=${safeRunId} profile=${safeProfileId} provider=${safeProvider} ` +
57+
`reason=${params.reason} window=${windowType} reused=${String(windowReused)}`,
58+
});
59+
}

src/agents/auth-profiles/usage.ts

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import type { OpenClawConfig } from "../../config/config.js";
22
import { normalizeProviderId } from "../model-selection.js";
3+
import { logAuthProfileFailureStateChange } from "./state-observation.js";
34
import { saveAuthProfileStore, updateAuthProfileStoreWithLock } from "./store.js";
45
import type { AuthProfileFailureReason, AuthProfileStore, ProfileUsageStats } from "./types.js";
56

@@ -462,12 +463,16 @@ export async function markAuthProfileFailure(params: {
462463
reason: AuthProfileFailureReason;
463464
cfg?: OpenClawConfig;
464465
agentDir?: string;
466+
runId?: string;
465467
}): Promise<void> {
466-
const { store, profileId, reason, agentDir, cfg } = params;
468+
const { store, profileId, reason, agentDir, cfg, runId } = params;
467469
const profile = store.profiles[profileId];
468470
if (!profile || isAuthCooldownBypassedForProvider(profile.provider)) {
469471
return;
470472
}
473+
let nextStats: ProfileUsageStats | undefined;
474+
let previousStats: ProfileUsageStats | undefined;
475+
let updateTime = 0;
471476
const updated = await updateAuthProfileStoreWithLock({
472477
agentDir,
473478
updater: (freshStore) => {
@@ -482,19 +487,32 @@ export async function markAuthProfileFailure(params: {
482487
providerId: providerKey,
483488
});
484489

485-
updateUsageStatsEntry(freshStore, profileId, (existing) =>
486-
computeNextProfileUsageStats({
487-
existing: existing ?? {},
488-
now,
489-
reason,
490-
cfgResolved,
491-
}),
492-
);
490+
previousStats = freshStore.usageStats?.[profileId];
491+
updateTime = now;
492+
const computed = computeNextProfileUsageStats({
493+
existing: previousStats ?? {},
494+
now,
495+
reason,
496+
cfgResolved,
497+
});
498+
nextStats = computed;
499+
updateUsageStatsEntry(freshStore, profileId, () => computed);
493500
return true;
494501
},
495502
});
496503
if (updated) {
497504
store.usageStats = updated.usageStats;
505+
if (nextStats) {
506+
logAuthProfileFailureStateChange({
507+
runId,
508+
profileId,
509+
provider: profile.provider,
510+
reason,
511+
previous: previousStats,
512+
next: nextStats,
513+
now: updateTime,
514+
});
515+
}
498516
return;
499517
}
500518
if (!store.profiles[profileId]) {
@@ -508,15 +526,25 @@ export async function markAuthProfileFailure(params: {
508526
providerId: providerKey,
509527
});
510528

511-
updateUsageStatsEntry(store, profileId, (existing) =>
512-
computeNextProfileUsageStats({
513-
existing: existing ?? {},
514-
now,
515-
reason,
516-
cfgResolved,
517-
}),
518-
);
529+
previousStats = store.usageStats?.[profileId];
530+
const computed = computeNextProfileUsageStats({
531+
existing: previousStats ?? {},
532+
now,
533+
reason,
534+
cfgResolved,
535+
});
536+
nextStats = computed;
537+
updateUsageStatsEntry(store, profileId, () => computed);
519538
saveAuthProfileStore(store, agentDir);
539+
logAuthProfileFailureStateChange({
540+
runId,
541+
profileId,
542+
provider: store.profiles[profileId]?.provider ?? profile.provider,
543+
reason,
544+
previous: previousStats,
545+
next: nextStats,
546+
now,
547+
});
520548
}
521549

522550
/**
@@ -528,12 +556,14 @@ export async function markAuthProfileCooldown(params: {
528556
store: AuthProfileStore;
529557
profileId: string;
530558
agentDir?: string;
559+
runId?: string;
531560
}): Promise<void> {
532561
await markAuthProfileFailure({
533562
store: params.store,
534563
profileId: params.profileId,
535564
reason: "unknown",
536565
agentDir: params.agentDir,
566+
runId: params.runId,
537567
});
538568
}
539569

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
import { createSubsystemLogger } from "../logging/subsystem.js";
2+
import { sanitizeForLog } from "../terminal/ansi.js";
3+
import type { FallbackAttempt, ModelCandidate } from "./model-fallback.types.js";
4+
import { buildTextObservationFields } from "./pi-embedded-error-observation.js";
5+
import type { FailoverReason } from "./pi-embedded-helpers.js";
6+
7+
const decisionLog = createSubsystemLogger("model-fallback").child("decision");
8+
9+
function buildErrorObservationFields(error?: string): {
10+
errorPreview?: string;
11+
errorHash?: string;
12+
errorFingerprint?: string;
13+
httpCode?: string;
14+
providerErrorType?: string;
15+
providerErrorMessagePreview?: string;
16+
requestIdHash?: string;
17+
} {
18+
const observed = buildTextObservationFields(error);
19+
return {
20+
errorPreview: observed.textPreview,
21+
errorHash: observed.textHash,
22+
errorFingerprint: observed.textFingerprint,
23+
httpCode: observed.httpCode,
24+
providerErrorType: observed.providerErrorType,
25+
providerErrorMessagePreview: observed.providerErrorMessagePreview,
26+
requestIdHash: observed.requestIdHash,
27+
};
28+
}
29+
30+
export function logModelFallbackDecision(params: {
31+
decision:
32+
| "skip_candidate"
33+
| "probe_cooldown_candidate"
34+
| "candidate_failed"
35+
| "candidate_succeeded";
36+
runId?: string;
37+
requestedProvider: string;
38+
requestedModel: string;
39+
candidate: ModelCandidate;
40+
attempt?: number;
41+
total?: number;
42+
reason?: FailoverReason | null;
43+
status?: number;
44+
code?: string;
45+
error?: string;
46+
nextCandidate?: ModelCandidate;
47+
isPrimary?: boolean;
48+
requestedModelMatched?: boolean;
49+
fallbackConfigured?: boolean;
50+
allowTransientCooldownProbe?: boolean;
51+
profileCount?: number;
52+
previousAttempts?: FallbackAttempt[];
53+
}): void {
54+
const nextText = params.nextCandidate
55+
? `${sanitizeForLog(params.nextCandidate.provider)}/${sanitizeForLog(params.nextCandidate.model)}`
56+
: "none";
57+
const reasonText = params.reason ?? "unknown";
58+
const observedError = buildErrorObservationFields(params.error);
59+
decisionLog.warn("model fallback decision", {
60+
event: "model_fallback_decision",
61+
tags: ["error_handling", "model_fallback", params.decision],
62+
runId: params.runId,
63+
decision: params.decision,
64+
requestedProvider: params.requestedProvider,
65+
requestedModel: params.requestedModel,
66+
candidateProvider: params.candidate.provider,
67+
candidateModel: params.candidate.model,
68+
attempt: params.attempt,
69+
total: params.total,
70+
reason: params.reason,
71+
status: params.status,
72+
code: params.code,
73+
...observedError,
74+
nextCandidateProvider: params.nextCandidate?.provider,
75+
nextCandidateModel: params.nextCandidate?.model,
76+
isPrimary: params.isPrimary,
77+
requestedModelMatched: params.requestedModelMatched,
78+
fallbackConfigured: params.fallbackConfigured,
79+
allowTransientCooldownProbe: params.allowTransientCooldownProbe,
80+
profileCount: params.profileCount,
81+
previousAttempts: params.previousAttempts?.map((attempt) => ({
82+
provider: attempt.provider,
83+
model: attempt.model,
84+
reason: attempt.reason,
85+
status: attempt.status,
86+
code: attempt.code,
87+
...buildErrorObservationFields(attempt.error),
88+
})),
89+
consoleMessage:
90+
`model fallback decision: decision=${params.decision} requested=${sanitizeForLog(params.requestedProvider)}/${sanitizeForLog(params.requestedModel)} ` +
91+
`candidate=${sanitizeForLog(params.candidate.provider)}/${sanitizeForLog(params.candidate.model)} reason=${reasonText} next=${nextText}`,
92+
});
93+
}

0 commit comments

Comments
 (0)