
Commit 2cb03ee

fix(openai): route Codex audio to transcription model
1 parent a491090 commit 2cb03ee

8 files changed

Lines changed: 150 additions & 19 deletions


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ Docs: https://docs.openclaw.ai
 ### Changes
 
 - Plugins/migration: emit catalog-backed install hints when `plugins.entries` or `plugins.allow` references an official external plugin that is not installed, so upgraded configs point operators to `openclaw plugins install <spec>` instead of telling them to remove valid plugin config. (#77483) Thanks @hclsys.
+- OpenAI/Codex media: advertise Codex audio transcription in runtime and manifest metadata and route active Codex chat models to the OpenAI transcription default instead of sending chat model ids to audio transcription. Thanks @vincentkoc.
 - Dependencies: refresh runtime and provider packages including Pi 0.73.0, ACPX adapters, OpenAI, Anthropic, Slack, and TypeScript native preview, while keeping the Bedrock runtime installer override pinned below the Windows ARM Node 24 npm resolver failure.
 - Agents/performance: pass the resolved workspace through BTW, compaction, embedded-run model generation, and PDF model setup so explicit agent-dir model refreshes can reuse the current workspace-scoped plugin metadata snapshot instead of falling back to cold plugin metadata scans. (#77519, #77532)
 - Config/plugin auto-enable: prefer the claiming plugin manifest id over a built-in channel alias when auto-allowlisting a configured channel, so WeCom/Yuanbao-style aliases resolve to the installed plugin id. Thanks @Beandon13.
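The routing rule the Codex media entry describes is small: for the audio capability, the active Codex chat model id (for example "gpt-5.5") is never a valid transcription model, so the provider's declared transcription default is used instead. A minimal TypeScript sketch of that rule follows; the helper name and metadata shape are illustrative, not the actual runner code from this commit.

// Illustrative sketch only; pickCodexAudioModel is a hypothetical helper, not part of this commit.
type MediaProviderMeta = {
  capabilities: Array<"image" | "audio">;
  defaultModels: Partial<Record<"image" | "audio", string>>;
};

function pickCodexAudioModel(meta: MediaProviderMeta, activeChatModel?: string): string | undefined {
  // Chat model ids such as "gpt-5.5" are not valid transcription models, so the
  // active chat model is deliberately ignored for the audio capability.
  void activeChatModel;
  return meta.defaultModels.audio;
}

// pickCodexAudioModel(
//   { capabilities: ["image", "audio"], defaultModels: { image: "gpt-5.5", audio: "gpt-4o-transcribe" } },
//   "gpt-5.5",
// ) === "gpt-4o-transcribe"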

extensions/openai/media-understanding-provider.test.ts

Lines changed: 36 additions & 1 deletion
@@ -4,10 +4,26 @@ import {
   installPinnedHostnameTestHooks,
 } from "openclaw/plugin-sdk/test-env";
 import { describe, expect, it } from "vitest";
-import { transcribeOpenAiAudio } from "./media-understanding-provider.js";
+import {
+  openaiCodexMediaUnderstandingProvider,
+  transcribeOpenAiAudio,
+  transcribeOpenAiCodexAudio,
+} from "./media-understanding-provider.js";
 
 installPinnedHostnameTestHooks();
 
+describe("openaiCodexMediaUnderstandingProvider", () => {
+  it("declares audio support with the transcription default", () => {
+    expect(openaiCodexMediaUnderstandingProvider.capabilities).toEqual(["image", "audio"]);
+    expect(openaiCodexMediaUnderstandingProvider.defaultModels).toEqual({
+      image: "gpt-5.5",
+      audio: "gpt-4o-transcribe",
+    });
+    expect(openaiCodexMediaUnderstandingProvider.autoPriority).toEqual({ image: 20, audio: 20 });
+    expect(openaiCodexMediaUnderstandingProvider.transcribeAudio).toBe(transcribeOpenAiCodexAudio);
+  });
+});
+
 describe("transcribeOpenAiAudio", () => {
   it("respects lowercase authorization header overrides", async () => {
     const { fetchFn, getAuthHeader } = createAuthCaptureJsonFetch({ text: "ok" });
@@ -82,3 +98,22 @@ describe("transcribeOpenAiAudio", () => {
     ).rejects.toThrow("Audio transcription response missing text");
   });
 });
+
+describe("transcribeOpenAiCodexAudio", () => {
+  it("uses the OpenAI transcription default through the Codex provider id", async () => {
+    const { fetchFn, getRequest } = createRequestCaptureJsonFetch({ text: "hello" });
+
+    const result = await transcribeOpenAiCodexAudio({
+      buffer: Buffer.from("audio-bytes"),
+      fileName: "voice.wav",
+      apiKey: "test-key",
+      timeoutMs: 1234,
+      model: " ",
+      fetchFn,
+    });
+
+    const form = getRequest().init?.body as FormData;
+    expect(result.model).toBe("gpt-4o-transcribe");
+    expect(form.get("model")).toBe("gpt-4o-transcribe");
+  });
+});

extensions/openai/media-understanding-provider.ts

Lines changed: 13 additions & 3 deletions
@@ -18,6 +18,15 @@ export async function transcribeOpenAiAudio(params: AudioTranscriptionRequest) {
   });
 }
 
+export async function transcribeOpenAiCodexAudio(params: AudioTranscriptionRequest) {
+  return await transcribeOpenAiCompatibleAudio({
+    ...params,
+    provider: "openai-codex",
+    defaultBaseUrl: DEFAULT_OPENAI_AUDIO_BASE_URL,
+    defaultModel: OPENAI_DEFAULT_AUDIO_TRANSCRIPTION_MODEL,
+  });
+}
+
 export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "openai",
   capabilities: ["image", "audio"],
@@ -33,9 +42,10 @@ export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
 
 export const openaiCodexMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "openai-codex",
-  capabilities: ["image"],
-  defaultModels: { image: "gpt-5.5" },
-  autoPriority: { image: 20 },
+  capabilities: ["image", "audio"],
+  defaultModels: { image: "gpt-5.5", audio: OPENAI_DEFAULT_AUDIO_TRANSCRIPTION_MODEL },
+  autoPriority: { image: 20, audio: 20 },
   describeImage: describeImageWithModel,
   describeImages: describeImagesWithModel,
+  transcribeAudio: transcribeOpenAiCodexAudio,
 };
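For context on why the blank-model test in the companion test file resolves to "gpt-4o-transcribe": transcribeOpenAiCompatibleAudio itself is not part of this diff, but the test behavior implies it trims the requested model and falls back to the defaultModel passed by transcribeOpenAiCodexAudio. A hedged sketch of that assumed fallback:

// Assumed fallback behavior of the shared transcription helper; the real
// transcribeOpenAiCompatibleAudio lives outside this diff, so this is a sketch.
function resolveTranscriptionModel(requestedModel: string | undefined, defaultModel: string): string {
  const trimmed = requestedModel?.trim();
  return trimmed ? trimmed : defaultModel;
}

// resolveTranscriptionModel(" ", "gpt-4o-transcribe") === "gpt-4o-transcribe"
// resolveTranscriptionModel("whisper-1", "gpt-4o-transcribe") === "whisper-1"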

extensions/openai/openclaw.plugin.json

Lines changed: 7 additions & 2 deletions
@@ -789,9 +789,14 @@
       }
     },
     "openai-codex": {
-      "capabilities": ["image"],
+      "capabilities": ["image", "audio"],
       "defaultModels": {
-        "image": "gpt-5.5"
+        "image": "gpt-5.5",
+        "audio": "gpt-4o-transcribe"
+      },
+      "autoPriority": {
+        "image": 20,
+        "audio": 20
       }
     }
   },

extensions/openai/openclaw.plugin.test.ts

Lines changed: 22 additions & 0 deletions
@@ -6,6 +6,14 @@ import { buildOpenAIProvider } from "./openai-provider.js";
 const manifest = JSON.parse(
   readFileSync(new URL("./openclaw.plugin.json", import.meta.url), "utf8"),
 ) as {
+  mediaUnderstandingProviderMetadata?: Record<
+    string,
+    {
+      capabilities?: string[];
+      defaultModels?: Record<string, string>;
+      autoPriority?: Record<string, number>;
+    }
+  >;
   providerAuthChoices?: Array<{
     provider?: string;
     method?: string;
@@ -72,6 +80,20 @@ describe("OpenAI plugin manifest", () => {
     expect(codexBrowserLogin?.deprecatedChoiceIds).toContain("openai-codex-import");
   });
 
+  it("keeps Codex media-understanding manifest metadata aligned with runtime audio support", () => {
+    expect(manifest.mediaUnderstandingProviderMetadata?.["openai-codex"]).toMatchObject({
+      capabilities: ["image", "audio"],
+      defaultModels: {
+        image: "gpt-5.5",
+        audio: "gpt-4o-transcribe",
+      },
+      autoPriority: {
+        image: 20,
+        audio: 20,
+      },
+    });
+  });
+
   it("labels OpenAI API key and Codex auth choices without stale mixed OAuth wording", () => {
     const choices = manifest.providerAuthChoices ?? [];
     const codexBrowserLogin = choices.find((choice) => choice.choiceId === "openai-codex");

src/media-understanding/defaults.test.ts

Lines changed: 7 additions & 3 deletions
@@ -58,9 +58,9 @@ const mediaMetadataPlugins = vi.hoisted(() => [
     autoPriority: { image: 10, audio: 10 },
   },
   "openai-codex": {
-    capabilities: ["image"],
-    defaultModels: { image: "gpt-5.5" },
-    autoPriority: { image: 20 },
+    capabilities: ["image", "audio"],
+    defaultModels: { image: "gpt-5.5", audio: "gpt-4o-transcribe" },
+    autoPriority: { image: 20, audio: 20 },
   },
   opencode: { capabilities: ["image"], defaultModels: { image: "gpt-5-nano" } },
   "opencode-go": { capabilities: ["image"], defaultModels: { image: "kimi-k2.6" } },
@@ -108,6 +108,9 @@ describe("resolveDefaultMediaModel", () => {
     expect(resolveDefaultMediaModel({ providerId: "mistral", capability: "audio" })).toBe(
       "voxtral-mini-latest",
     );
+    expect(resolveDefaultMediaModel({ providerId: "openai-codex", capability: "audio" })).toBe(
+      "gpt-4o-transcribe",
+    );
   });
 
   it("resolves bundled image defaults beyond the historical core set", () => {
@@ -136,6 +139,7 @@ describe("resolveAutoMediaKeyProviders", () => {
   it("keeps the bundled audio fallback order", () => {
     expect(resolveAutoMediaKeyProviders({ capability: "audio" })).toEqual([
       "openai",
+      "openai-codex",
       "xai",
       "google",
       "mistral",

src/media-understanding/runner.auto-audio.test.ts

Lines changed: 46 additions & 0 deletions
@@ -95,6 +95,52 @@ describe("runCapability auto audio entries", () => {
     expect(result.decision.outcome).toBe("success");
   });
 
+  it("uses the provider audio default instead of the active Codex chat model", async () => {
+    let runResult: Awaited<ReturnType<typeof runCapability>> | undefined;
+    let seenModel: string | undefined;
+
+    await withAudioFixture("openclaw-auto-audio-codex", async ({ ctx, media, cache }) => {
+      const providerRegistry = createProviderRegistry({
+        "openai-codex": {
+          id: "openai-codex",
+          capabilities: ["image", "audio"],
+          defaultModels: { image: "gpt-5.5", audio: "gpt-4o-transcribe" },
+          transcribeAudio: async (req) => {
+            seenModel = req.model;
+            return { text: "codex audio", model: req.model ?? "unknown" };
+          },
+        },
+      });
+      const cfg = {
+        models: {
+          providers: {
+            "openai-codex": {
+              apiKey: "codex-test-key", // pragma: allowlist secret
+              models: [],
+            },
+          },
+        },
+      } as unknown as OpenClawConfig;
+
+      runResult = await runCapability({
+        capability: "audio",
+        cfg,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry,
+        activeModel: { provider: "openai-codex", model: "gpt-5.5" },
+      });
+    });
+
+    expect(runResult?.outputs[0]).toMatchObject({
+      provider: "openai-codex",
+      model: "gpt-4o-transcribe",
+      text: "codex audio",
+    });
+    expect(seenModel).toBe("gpt-4o-transcribe");
+  });
+
   it("prefers provider keys over auto-detected local whisper", async () => {
     const binDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-auto-audio-bin-"));
     try {

src/media-understanding/runner.ts

Lines changed: 18 additions & 10 deletions
@@ -749,16 +749,24 @@ async function resolveActiveModelEntry(params: {
   if (!hasAuth) {
     return null;
   }
-  const model =
-    params.capability === "image"
-      ? await resolveAutoImageModelId({
-          cfg: params.cfg,
-          providerId,
-          providerRegistry: params.providerRegistry,
-          explicitModel: params.activeModel?.model,
-        })
-      : params.activeModel?.model;
-  if (params.capability === "image" && !model) {
+  let model: string | undefined;
+  if (params.capability === "image") {
+    model = await resolveAutoImageModelId({
+      cfg: params.cfg,
+      providerId,
+      providerRegistry: params.providerRegistry,
+      explicitModel: params.activeModel?.model,
+    });
+  } else if (params.capability === "audio") {
+    model = resolveDefaultMediaModelFromRegistry({
+      providerId,
+      capability: "audio",
+      providerRegistry: params.providerRegistry,
+    });
+  } else {
+    model = params.activeModel?.model;
+  }
+  if ((params.capability === "image" || params.capability === "audio") && !model) {
     return null;
   }
   return {
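resolveDefaultMediaModelFromRegistry is called in the new audio branch but defined elsewhere; judging by the registry entries the tests construct, it presumably reads the provider's declared audio default from the registry. A sketch of that assumed lookup, with illustrative names and types:

// Assumed shape of the registry lookup used in the audio branch above; names and
// types are illustrative since the real helper is outside this diff.
type MediaCapability = "image" | "audio";

interface MediaRegistryEntry {
  defaultModels?: Partial<Record<MediaCapability, string>>;
}

function lookupDefaultMediaModel(params: {
  providerId: string;
  capability: MediaCapability;
  providerRegistry: Record<string, MediaRegistryEntry>;
}): string | undefined {
  // Returning undefined lets the guard above bail out with null instead of
  // sending a chat model id to the transcription endpoint.
  return params.providerRegistry[params.providerId]?.defaultModels?.[params.capability];
}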
