fix: repair sanitized replay tool results before send (#67620) (thanks @stainlu)

stainlu · obviyus · web-flow · commit c3c7a9953ff0 · 2026-04-16T18:38:57.000+05:30
* fix(agents): preserve native Anthropic tool IDs for hybrid providers

  Fixes #66892

  MiniMax and other hybrid providers use api.minimaxi.com/anthropic (modelApi: anthropic-messages), which generates and expects native Anthropic tool_call_ids in toolu_* format. The hybrid replay policy (buildHybridAnthropicOrOpenAIReplayPolicy) applied strict sanitization that stripped underscores from these IDs, causing MiniMax to reject them with error 2013.

  The native Anthropic provider already preserved these IDs via preserveNativeAnthropicToolUseIds (added in 4613f12). This commit enables the same flag for the hybrid anthropic-messages branch, so toolu_* IDs pass through unsanitized while other synthetic IDs still get strict cleanup.

* fix(agents): repair sanitized replay tool results before send

* fix: repair sanitized replay tool results before send (#67620) (thanks @stainlu)

* fix: preserve aborted-span tool results during replay sanitize (#67620) (thanks @stainlu)

---------

Co-authored-by: Ayaan Zaidi <hi@obviy.us>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -40,6 +40,7 @@ Docs: https://docs.openclaw.ai
 - Agents/tool-loop: enable the unknown-tool stream guard by default. Previously `resolveUnknownToolGuardThreshold` returned `undefined` unless `tools.loopDetection.enabled` was explicitly set to `true`, which left the protection off in the default configuration. A hallucinated or removed tool (for example `himalaya` after it was dropped from `skills.allowBundled`) would then loop "Tool X not found" attempts until the full embedded-run timeout. The guard has no false-positive surface because it only triggers on tools that are objectively not registered in the run, so it now stays on regardless of `tools.loopDetection.enabled` and still accepts `tools.loopDetection.unknownToolThreshold` as a per-run override (default 10). (#67401) Thanks @xantorres.
 - TUI/streaming: add a client-side streaming watchdog to `tui-event-handlers` so the `streaming · Xm Ys` activity indicator resets to `idle` after 30s of delta silence on the active run. Guards against lost or late `state: "final"` chat events (WS reconnects, gateway restarts, etc.) leaving the TUI stuck on `streaming` indefinitely; a new system log line surfaces the reset so users know to send a new message to resync. The window is configurable via the new `streamingWatchdogMs` context option (set to `0` to disable), and the handler now exposes a `dispose()` that clears the pending timer on shutdown. (#67401) Thanks @xantorres.
 - Extensions/lmstudio: add exponential backoff to the inference-preload wrapper so an LM Studio model-load failure (for example the built-in memory guardrail rejecting a load because the swap is saturated) no longer produces a WARN line every ~2s for every chat request. The wrapper now records consecutive preload failures per `(baseUrl, modelKey, contextLength)` tuple with a 5s → 10s → 20s → … → 5min cooldown and skips the preload step entirely while a cooldown is active, letting chat requests proceed directly to the stream (the model is often already loaded via the LM Studio UI). The combined `preload failed` log line now reports consecutive-failure count and remaining cooldown so operators can act on the real issue instead of drowning in repeated warnings. (#67401) Thanks @xantorres.
+- Agents/replay: re-run tool/result pairing after strict replay tool-call ID sanitization on outbound requests so Anthropic-compatible providers like MiniMax no longer receive malformed orphan tool-result IDs such as `...toolresult1` during compaction and retry flows. (#67620) Thanks @stainlu.
 
 ## 2026.4.15-beta.1
 
diff --git a/src/agents/pi-embedded-runner/run/attempt.tool-call-normalization.test.ts b/src/agents/pi-embedded-runner/run/attempt.tool-call-normalization.test.ts
@@ -0,0 +1,107 @@
+import type { AgentMessage } from "@mariozechner/pi-agent-core";
+import { describe, expect, it } from "vitest";
+import { sanitizeReplayToolCallIdsForStream } from "./attempt.tool-call-normalization.js";
+
+describe("sanitizeReplayToolCallIdsForStream", () => {
+  it("drops orphaned tool results after strict id sanitization", () => {
+    const messages: AgentMessage[] = [
+      {
+        role: "toolResult",
+        toolCallId: "call_function_av7cbkigmk7x1",
+        toolUseId: "call_function_av7cbkigmk7x1",
+        toolName: "read",
+        content: [{ type: "text", text: "stale" }],
+        isError: false,
+      } as never,
+    ];
+
+    expect(
+      sanitizeReplayToolCallIdsForStream({
+        messages,
+        mode: "strict",
+        repairToolUseResultPairing: true,
+      }),
+    ).toEqual([]);
+  });
+
+  it("keeps matched assistant and tool-result ids aligned", () => {
+    const rawId = "call_function_av7cbkigmk7x1";
+    const messages: AgentMessage[] = [
+      {
+        role: "assistant",
+        content: [{ type: "toolUse", id: rawId, name: "read", input: { path: "." } }],
+      } as never,
+      {
+        role: "toolResult",
+        toolCallId: rawId,
+        toolUseId: rawId,
+        toolName: "read",
+        content: [{ type: "text", text: "ok" }],
+        isError: false,
+      } as never,
+    ];
+
+    const out = sanitizeReplayToolCallIdsForStream({
+      messages,
+      mode: "strict",
+      repairToolUseResultPairing: true,
+    });
+
+    expect(out).toMatchObject([
+      {
+        role: "assistant",
+        content: [{ type: "toolUse", id: "callfunctionav7cbkigmk7x1", name: "read" }],
+      },
+      {
+        role: "toolResult",
+        toolCallId: "callfunctionav7cbkigmk7x1",
+        toolUseId: "callfunctionav7cbkigmk7x1",
+        toolName: "read",
+      },
+    ]);
+  });
+
+  it("keeps real tool results for aborted assistant spans", () => {
+    const rawId = "call_function_av7cbkigmk7x1";
+    const out = sanitizeReplayToolCallIdsForStream({
+      messages: [
+        {
+          role: "assistant",
+          stopReason: "aborted",
+          content: [{ type: "toolUse", id: rawId, name: "read", input: { path: "." } }],
+        } as never,
+        {
+          role: "toolResult",
+          toolCallId: rawId,
+          toolUseId: rawId,
+          toolName: "read",
+          content: [{ type: "text", text: "partial" }],
+          isError: false,
+        } as never,
+        {
+          role: "user",
+          content: [{ type: "text", text: "retry" }],
+        } as never,
+      ],
+      mode: "strict",
+      repairToolUseResultPairing: true,
+    });
+
+    expect(out).toMatchObject([
+      {
+        role: "assistant",
+        stopReason: "aborted",
+        content: [{ type: "toolUse", id: "callfunctionav7cbkigmk7x1", name: "read" }],
+      },
+      {
+        role: "toolResult",
+        toolCallId: "callfunctionav7cbkigmk7x1",
+        toolUseId: "callfunctionav7cbkigmk7x1",
+        toolName: "read",
+      },
+      {
+        role: "user",
+      },
+    ]);
+  });
+});
diff --git a/src/agents/pi-embedded-runner/run/attempt.tool-call-normalization.ts b/src/agents/pi-embedded-runner/run/attempt.tool-call-normalization.ts
@@ -6,7 +6,11 @@ import {
   isRedactedSessionsSpawnAttachment,
   sanitizeToolUseResultPairing,
 } from "../../session-transcript-repair.js";
-import { extractToolCallsFromAssistant } from "../../tool-call-id.js";
+import {
+  extractToolCallsFromAssistant,
+  sanitizeToolCallIdsForCloudCodeAssist,
+  type ToolCallIdMode,
+} from "../../tool-call-id.js";
 import { normalizeToolName } from "../../tool-policy.js";
 import { shouldAllowProviderOwnedThinkingReplay } from "../../transcript-policy.js";
 import type { TranscriptPolicy } from "../../transcript-policy.js";
@@ -868,6 +872,25 @@ export function wrapStreamFnTrimToolCallNames(
   };
 }
 
+export function sanitizeReplayToolCallIdsForStream(params: {
+  messages: AgentMessage[];
+  mode: ToolCallIdMode;
+  allowedToolNames?: Set<string>;
+  preserveNativeAnthropicToolUseIds?: boolean;
+  preserveReplaySafeThinkingToolCallIds?: boolean;
+  repairToolUseResultPairing?: boolean;
+}): AgentMessage[] {
+  const sanitized = sanitizeToolCallIdsForCloudCodeAssist(params.messages, params.mode, {
+    preserveNativeAnthropicToolUseIds: params.preserveNativeAnthropicToolUseIds,
+    preserveReplaySafeThinkingToolCallIds: params.preserveReplaySafeThinkingToolCallIds,
+    allowedToolNames: params.allowedToolNames,
+  });
+  if (!params.repairToolUseResultPairing) {
+    return sanitized;
+  }
+  return sanitizeToolUseResultPairing(sanitized);
+}
+
 export function wrapStreamFnSanitizeMalformedToolCalls(
   baseFn: StreamFn,
   allowedToolNames?: Set<string>,
diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts
@@ -115,7 +115,6 @@ import { resolveSystemPromptOverride } from "../../system-prompt-override.js";
 import { buildSystemPromptParams } from "../../system-prompt-params.js";
 import { buildSystemPromptReport } from "../../system-prompt-report.js";
 import { resolveAgentTimeoutMs } from "../../timeout.js";
-import { sanitizeToolCallIdsForCloudCodeAssist } from "../../tool-call-id.js";
 import { UNKNOWN_TOOL_THRESHOLD } from "../../tool-loop-detection.js";
 import {
   resolveTranscriptPolicy,
@@ -225,6 +224,7 @@ import {
   wrapStreamFnRepairMalformedToolCallArguments,
 } from "./attempt.tool-call-argument-repair.js";
 import {
+  sanitizeReplayToolCallIdsForStream,
   wrapStreamFnSanitizeMalformedToolCalls,
   wrapStreamFnTrimToolCallNames,
 } from "./attempt.tool-call-normalization.js";
@@ -1251,25 +1251,23 @@ export async function runEmbeddedAttempt(
           if (!Array.isArray(messages)) {
             return inner(model, context, options);
           }
-          const allowProviderOwnedThinkingReplay = shouldAllowProviderOwnedThinkingReplay({
-            modelApi: (model as { api?: unknown })?.api as string | null | undefined,
-            policy: transcriptPolicy,
-          });
-          const sanitized = sanitizeToolCallIdsForCloudCodeAssist(
-            messages as AgentMessage[],
+          const nextMessages = sanitizeReplayToolCallIdsForStream({
+            messages: messages as AgentMessage[],
             mode,
-            {
-              preserveNativeAnthropicToolUseIds: transcriptPolicy.preserveNativeAnthropicToolUseIds,
-              preserveReplaySafeThinkingToolCallIds: allowProviderOwnedThinkingReplay,
-              allowedToolNames,
-            },
-          );
-          if (sanitized === messages) {
+            allowedToolNames,
+            preserveNativeAnthropicToolUseIds: transcriptPolicy.preserveNativeAnthropicToolUseIds,
+            preserveReplaySafeThinkingToolCallIds: shouldAllowProviderOwnedThinkingReplay({
+              modelApi: (model as { api?: unknown })?.api as string | null | undefined,
+              policy: transcriptPolicy,
+            }),
+            repairToolUseResultPairing: transcriptPolicy.repairToolUseResultPairing,
+          });
+          if (nextMessages === messages) {
             return inner(model, context, options);
           }
           const nextContext = {
             ...(context as unknown as Record<string, unknown>),
-            messages: sanitized,
+            messages: nextMessages,
           } as unknown;
           return inner(model, nextContext as typeof context, options);
         };
diff --git a/src/plugins/provider-replay-helpers.test.ts b/src/plugins/provider-replay-helpers.test.ts
@@ -93,7 +93,6 @@ describe("provider replay helpers", () => {
   });
 
   it("builds hybrid anthropic or openai replay policy", () => {
-    // Sonnet 4.6 preserves thinking blocks even when flag is set
     const sonnet46Policy = buildHybridAnthropicOrOpenAIReplayPolicy(
       {
         provider: "minimax",
@@ -107,7 +106,6 @@ describe("provider replay helpers", () => {
     });
     expect(sonnet46Policy).not.toHaveProperty("dropThinkingBlocks");
 
-    // Legacy model still drops
     expect(
       buildHybridAnthropicOrOpenAIReplayPolicy(
         {

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -93,7 +93,6 @@ describe("provider replay helpers", () => {` |
| 93 | 93 | `});` |
| 94 | 94 |  |
| 95 | 95 | `it("builds hybrid anthropic or openai replay policy", () => {` |
| 96 |  | `- // Sonnet 4.6 preserves thinking blocks even when flag is set` |
| 97 | 96 | `const sonnet46Policy = buildHybridAnthropicOrOpenAIReplayPolicy(` |
| 98 | 97 | `{` |
| 99 | 98 | `provider: "minimax",` |
|  |  | `@@ -107,7 +106,6 @@ describe("provider replay helpers", () => {` |
| 107 | 106 | `});` |
| 108 | 107 | `expect(sonnet46Policy).not.toHaveProperty("dropThinkingBlocks");` |
| 109 | 108 |  |
| 110 |  | `- // Legacy model still drops` |
| 111 | 109 | `expect(` |
| 112 | 110 | `buildHybridAnthropicOrOpenAIReplayPolicy(` |
| 113 | 111 | `{` |