remoteclaw
diff --git a/‎CHANGELOG.md‎
Lines changed: 1 addition & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/agents/auth-profiles/state-observation.test.ts‎
Lines changed: 38 additions & 0 deletions b/‎src/agents/auth-profiles/state-observation.test.ts‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎src/agents/auth-profiles/state-observation.ts‎
Lines changed: 59 additions & 0 deletions b/‎src/agents/auth-profiles/state-observation.ts‎
Lines changed: 59 additions & 0 deletions
diff --git a/‎src/agents/auth-profiles/usage.ts‎
Lines changed: 47 additions & 17 deletions b/‎src/agents/auth-profiles/usage.ts‎
Lines changed: 47 additions & 17 deletions
diff --git a/‎src/agents/model-fallback-observation.ts‎
Lines changed: 93 additions & 0 deletions b/‎src/agents/model-fallback-observation.ts‎
Lines changed: 93 additions & 0 deletions
@@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai
 - ACP/regressions: add gateway RPC coverage for ACP lineage patching, ACPX runtime coverage for image prompt serialization, and an operator smoke-test procedure for live ACP spawn verification. (#41456) Thanks @mbelinky.
 - Agents/billing recovery: probe single-provider billing cooldowns on the existing throttle so topping up credits can recover without a manual gateway restart. (#41422) thanks @altaywtf.
 - ACP/follow-up hardening: make session restore and prompt completion degrade gracefully on transcript/update failures, enforce bounded tool-location traversal, and skip non-image ACPX turns the runtime cannot serialize. (#41464) Thanks @mbelinky.
+- Agents/fallback observability: add structured, sanitized model-fallback decision and auth-profile failure-state events with correlated run IDs so cooldown probes and failover paths are easier to trace in logs. (#41337) thanks @altaywtf.
 
 ## 2026.3.8
 
 
@@ -0,0 +1,38 @@
+import { afterEach, describe, expect, it, vi } from "vitest";
+import { resetLogger, setLoggerOverride } from "../../logging/logger.js";
+import { logAuthProfileFailureStateChange } from "./state-observation.js";
+
+// Clear the logger override and reset the logger after each test case.
+afterEach(() => {
+  setLoggerOverride(null);
+  resetLogger();
+});
+
+describe("logAuthProfileFailureStateChange", () => {
+  it("sanitizes consoleMessage fields before logging", () => {
+    // Capture console.warn output without printing it.
+    const consoleWarn = vi.spyOn(console, "warn").mockImplementation(() => {});
+    setLoggerOverride({ level: "silent", consoleLevel: "warn" });
+
+    const startedAt = 1_700_000_000_000;
+    // runId carries newline/tab/carriage-return characters and provider carries
+    // an ESC ... BEL escape sequence; neither may survive into the console line.
+    logAuthProfileFailureStateChange({
+      runId: "run-1\nforged\tentry\rtest",
+      profileId: "openai:profile-1",
+      provider: "openai\u001b]8;;https://evil.test\u0007",
+      reason: "overloaded",
+      previous: undefined,
+      next: {
+        errorCount: 1,
+        cooldownUntil: startedAt + 60_000,
+        failureCounts: { overloaded: 1 },
+      },
+      now: startedAt,
+    });
+
+    const emitted = consoleWarn.mock.calls[0]?.[0];
+    expect(typeof emitted).toBe("string");
+
+    const expectedFragments = [
+      "runId=run-1 forged entry test",
+      "provider=openai]8;;https://evil.test",
+    ];
+    for (const fragment of expectedFragments) {
+      expect(emitted).toContain(fragment);
+    }
+
+    const forbiddenCharacters = ["\n", "\r", "\t", "\u001b"];
+    for (const character of forbiddenCharacters) {
+      expect(emitted).not.toContain(character);
+    }
+  });
+});
@@ -0,0 +1,59 @@
+import { redactIdentifier } from "../../logging/redact-identifier.js";
+import { createSubsystemLogger } from "../../logging/subsystem.js";
+import { sanitizeForConsole } from "../pi-embedded-error-observation.js";
+import type { AuthProfileFailureReason, ProfileUsageStats } from "./types.js";
+
+const observationLog = createSubsystemLogger("agent/embedded"); // Subsystem logger ("agent/embedded") that logAuthProfileFailureStateChange writes to.
+
+/**
+ * Logs a structured warning describing an update to an auth profile's failure state.
+ *
+ * The failure reason selects which window is reported: "billing" and
+ * "auth_permanent" are reported as a "disabled" window, every other reason as a
+ * "cooldown" window. The profile id is passed through `redactIdentifier` before
+ * logging; `runId` and `provider` are passed through `sanitizeForConsole` for the
+ * `consoleMessage` text only, while the structured fields carry them as given.
+ *
+ * @param params.runId - Optional run identifier; rendered as "-" in the console
+ *   message when `sanitizeForConsole` yields a nullish value.
+ * @param params.previous - Usage stats before the update, if any.
+ * @param params.next - Usage stats after the update.
+ * @param params.now - Reference time compared against the previous window deadline.
+ */
+export function logAuthProfileFailureStateChange(params: {
+  runId?: string;
+  profileId: string;
+  provider: string;
+  reason: AuthProfileFailureReason;
+  previous: ProfileUsageStats | undefined;
+  next: ProfileUsageStats;
+  now: number;
+}): void {
+  const { previous, next, now, reason } = params;
+  const usesDisabledWindow = reason === "billing" || reason === "auth_permanent";
+  const windowType = usesDisabledWindow ? "disabled" : "cooldown";
+  const previousCooldownUntil = previous?.cooldownUntil;
+  const previousDisabledUntil = previous?.disabledUntil;
+
+  // Active cooldown/disable windows are intentionally immutable, so report whether
+  // this update kept the still-active previous deadline instead of extending it.
+  const priorDeadline = usesDisabledWindow ? previousDisabledUntil : previousCooldownUntil;
+  const nextDeadline = usesDisabledWindow ? next.disabledUntil : next.cooldownUntil;
+  const windowReused =
+    typeof priorDeadline === "number" &&
+    Number.isFinite(priorDeadline) &&
+    priorDeadline > now &&
+    priorDeadline === nextDeadline;
+
+  const safeProfileId = redactIdentifier(params.profileId, { len: 12 });
+  const safeRunId = sanitizeForConsole(params.runId) ?? "-";
+  const safeProvider = sanitizeForConsole(params.provider) ?? "-";
+
+  // Single-line, space-separated summary built only from sanitized/redacted values.
+  const consoleMessage = [
+    "auth profile failure state updated:",
+    `runId=${safeRunId}`,
+    `profile=${safeProfileId}`,
+    `provider=${safeProvider}`,
+    `reason=${reason}`,
+    `window=${windowType}`,
+    `reused=${String(windowReused)}`,
+  ].join(" ");
+
+  observationLog.warn("auth profile failure state updated", {
+    event: "auth_profile_failure_state_updated",
+    tags: ["error_handling", "auth_profiles", windowType],
+    runId: params.runId,
+    profileId: safeProfileId,
+    provider: params.provider,
+    reason,
+    windowType,
+    windowReused,
+    previousErrorCount: previous?.errorCount,
+    errorCount: next.errorCount,
+    previousCooldownUntil,
+    cooldownUntil: next.cooldownUntil,
+    previousDisabledUntil,
+    disabledUntil: next.disabledUntil,
+    previousDisabledReason: previous?.disabledReason,
+    disabledReason: next.disabledReason,
+    failureCounts: next.failureCounts,
+    consoleMessage,
+  });
+}
@@ -1,5 +1,6 @@
 import type { OpenClawConfig } from "../../config/config.js";
 import { normalizeProviderId } from "../model-selection.js";
+import { logAuthProfileFailureStateChange } from "./state-observation.js";
 import { saveAuthProfileStore, updateAuthProfileStoreWithLock } from "./store.js";
 import type { AuthProfileFailureReason, AuthProfileStore, ProfileUsageStats } from "./types.js";
 
@@ -462,12 +463,16 @@ export async function markAuthProfileFailure(params: {
   reason: AuthProfileFailureReason;
   cfg?: OpenClawConfig;
   agentDir?: string;
+  runId?: string;
 }): Promise<void> {
-  const { store, profileId, reason, agentDir, cfg } = params;
+  const { store, profileId, reason, agentDir, cfg, runId } = params;
   const profile = store.profiles[profileId];
   if (!profile || isAuthCooldownBypassedForProvider(profile.provider)) {
     return;
   }
+  let nextStats: ProfileUsageStats | undefined;
+  let previousStats: ProfileUsageStats | undefined;
+  let updateTime = 0;
   const updated = await updateAuthProfileStoreWithLock({
     agentDir,
     updater: (freshStore) => {
@@ -482,19 +487,32 @@ export async function markAuthProfileFailure(params: {
         providerId: providerKey,
       });
 
-      updateUsageStatsEntry(freshStore, profileId, (existing) =>
-        computeNextProfileUsageStats({
-          existing: existing ?? {},
-          now,
-          reason,
-          cfgResolved,
-        }),
-      );
+      previousStats = freshStore.usageStats?.[profileId];
+      updateTime = now;
+      const computed = computeNextProfileUsageStats({
+        existing: previousStats ?? {},
+        now,
+        reason,
+        cfgResolved,
+      });
+      nextStats = computed;
+      updateUsageStatsEntry(freshStore, profileId, () => computed);
       return true;
     },
   });
   if (updated) {
     store.usageStats = updated.usageStats;
+    if (nextStats) {
+      logAuthProfileFailureStateChange({
+        runId,
+        profileId,
+        provider: profile.provider,
+        reason,
+        previous: previousStats,
+        next: nextStats,
+        now: updateTime,
+      });
+    }
     return;
   }
   if (!store.profiles[profileId]) {
@@ -508,15 +526,25 @@ export async function markAuthProfileFailure(params: {
     providerId: providerKey,
   });
 
-  updateUsageStatsEntry(store, profileId, (existing) =>
-    computeNextProfileUsageStats({
-      existing: existing ?? {},
-      now,
-      reason,
-      cfgResolved,
-    }),
-  );
+  previousStats = store.usageStats?.[profileId];
+  const computed = computeNextProfileUsageStats({
+    existing: previousStats ?? {},
+    now,
+    reason,
+    cfgResolved,
+  });
+  nextStats = computed;
+  updateUsageStatsEntry(store, profileId, () => computed);
   saveAuthProfileStore(store, agentDir);
+  logAuthProfileFailureStateChange({
+    runId,
+    profileId,
+    provider: store.profiles[profileId]?.provider ?? profile.provider,
+    reason,
+    previous: previousStats,
+    next: nextStats,
+    now,
+  });
 }
 
 /**
@@ -528,12 +556,14 @@ export async function markAuthProfileCooldown(params: {
   store: AuthProfileStore;
   profileId: string;
   agentDir?: string;
+  runId?: string;
 }): Promise<void> {
   await markAuthProfileFailure({
     store: params.store,
     profileId: params.profileId,
     reason: "unknown",
     agentDir: params.agentDir,
+    runId: params.runId,
   });
 }
 
 
@@ -0,0 +1,93 @@
+import { createSubsystemLogger } from "../logging/subsystem.js";
+import { sanitizeForLog } from "../terminal/ansi.js";
+import type { FallbackAttempt, ModelCandidate } from "./model-fallback.types.js";
+import { buildTextObservationFields } from "./pi-embedded-error-observation.js";
+import type { FailoverReason } from "./pi-embedded-helpers.js";
+
+const decisionLog = createSubsystemLogger("model-fallback").child("decision"); // "decision" child of the "model-fallback" subsystem logger; logModelFallbackDecision writes to it.
+
+/**
+ * Maps the fields returned by `buildTextObservationFields` for an error string
+ * onto error-prefixed names: `textPreview`, `textHash` and `textFingerprint`
+ * become `errorPreview`, `errorHash` and `errorFingerprint`; the remaining
+ * fields keep their names.
+ *
+ * @param error - Optional error text handed to `buildTextObservationFields`.
+ * @returns The renamed observation fields.
+ */
+function buildErrorObservationFields(error?: string): {
+  errorPreview?: string;
+  errorHash?: string;
+  errorFingerprint?: string;
+  httpCode?: string;
+  providerErrorType?: string;
+  providerErrorMessagePreview?: string;
+  requestIdHash?: string;
+} {
+  const {
+    textPreview: errorPreview,
+    textHash: errorHash,
+    textFingerprint: errorFingerprint,
+    httpCode,
+    providerErrorType,
+    providerErrorMessagePreview,
+    requestIdHash,
+  } = buildTextObservationFields(error);
+
+  return {
+    errorPreview,
+    errorHash,
+    errorFingerprint,
+    httpCode,
+    providerErrorType,
+    providerErrorMessagePreview,
+    requestIdHash,
+  };
+}
+
+/**
+ * Logs a structured warning describing one model-fallback decision.
+ *
+ * The structured payload carries the requested and candidate provider/model, the
+ * attempt counters, the failure reason/status/code, observation fields derived
+ * from the error text via `buildErrorObservationFields`, and a per-attempt
+ * summary of `previousAttempts`. The `consoleMessage` text is built from
+ * provider/model values passed through `sanitizeForLog`.
+ *
+ * @param params.decision - Which decision is being recorded.
+ * @param params.reason - Failover reason; rendered as "unknown" in the console
+ *   message when nullish.
+ * @param params.nextCandidate - Next candidate, if any; rendered as "none" in
+ *   the console message when absent.
+ */
+export function logModelFallbackDecision(params: {
+  decision:
+    | "skip_candidate"
+    | "probe_cooldown_candidate"
+    | "candidate_failed"
+    | "candidate_succeeded";
+  runId?: string;
+  requestedProvider: string;
+  requestedModel: string;
+  candidate: ModelCandidate;
+  attempt?: number;
+  total?: number;
+  reason?: FailoverReason | null;
+  status?: number;
+  code?: string;
+  error?: string;
+  nextCandidate?: ModelCandidate;
+  isPrimary?: boolean;
+  requestedModelMatched?: boolean;
+  fallbackConfigured?: boolean;
+  allowTransientCooldownProbe?: boolean;
+  profileCount?: number;
+  previousAttempts?: FallbackAttempt[];
+}): void {
+  const { candidate, nextCandidate } = params;
+
+  // Renders a sanitized "provider/model" reference for the console message.
+  const formatModelRef = (provider: string, model: string): string =>
+    `${sanitizeForLog(provider)}/${sanitizeForLog(model)}`;
+
+  let nextText = "none";
+  if (nextCandidate) {
+    nextText = formatModelRef(nextCandidate.provider, nextCandidate.model);
+  }
+  const reasonText = params.reason ?? "unknown";
+  const observedError = buildErrorObservationFields(params.error);
+
+  // Summarize earlier attempts, replacing raw error text with observation fields.
+  const previousAttempts = params.previousAttempts?.map(
+    ({ provider, model, reason, status, code, error }) => ({
+      provider,
+      model,
+      reason,
+      status,
+      code,
+      ...buildErrorObservationFields(error),
+    }),
+  );
+
+  const requestedText = formatModelRef(params.requestedProvider, params.requestedModel);
+  const candidateText = formatModelRef(candidate.provider, candidate.model);
+  const consoleMessage = [
+    "model fallback decision:",
+    `decision=${params.decision}`,
+    `requested=${requestedText}`,
+    `candidate=${candidateText}`,
+    `reason=${reasonText}`,
+    `next=${nextText}`,
+  ].join(" ");
+
+  decisionLog.warn("model fallback decision", {
+    event: "model_fallback_decision",
+    tags: ["error_handling", "model_fallback", params.decision],
+    runId: params.runId,
+    decision: params.decision,
+    requestedProvider: params.requestedProvider,
+    requestedModel: params.requestedModel,
+    candidateProvider: candidate.provider,
+    candidateModel: candidate.model,
+    attempt: params.attempt,
+    total: params.total,
+    reason: params.reason,
+    status: params.status,
+    code: params.code,
+    ...observedError,
+    nextCandidateProvider: nextCandidate?.provider,
+    nextCandidateModel: nextCandidate?.model,
+    isPrimary: params.isPrimary,
+    requestedModelMatched: params.requestedModelMatched,
+    fallbackConfigured: params.fallbackConfigured,
+    allowTransientCooldownProbe: params.allowTransientCooldownProbe,
+    profileCount: params.profileCount,
+    previousAttempts,
+    consoleMessage,
+  });
+}