
Commit 2cb03ee

fix(openai): route Codex audio to transcription model
1 parent a491090 commit 2cb03ee

8 files changed

Lines changed: 150 additions & 19 deletions


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ Docs: https://docs.openclaw.ai
 ### Changes
 
 - Plugins/migration: emit catalog-backed install hints when `plugins.entries` or `plugins.allow` references an official external plugin that is not installed, so upgraded configs point operators to `openclaw plugins install <spec>` instead of telling them to remove valid plugin config. (#77483) Thanks @hclsys.
+- OpenAI/Codex media: advertise Codex audio transcription in runtime and manifest metadata and route active Codex chat models to the OpenAI transcription default instead of sending chat model ids to audio transcription. Thanks @vincentkoc.
 - Dependencies: refresh runtime and provider packages including Pi 0.73.0, ACPX adapters, OpenAI, Anthropic, Slack, and TypeScript native preview, while keeping the Bedrock runtime installer override pinned below the Windows ARM Node 24 npm resolver failure.
 - Agents/performance: pass the resolved workspace through BTW, compaction, embedded-run model generation, and PDF model setup so explicit agent-dir model refreshes can reuse the current workspace-scoped plugin metadata snapshot instead of falling back to cold plugin metadata scans. (#77519, #77532)
 - Config/plugin auto-enable: prefer the claiming plugin manifest id over a built-in channel alias when auto-allowlisting a configured channel, so WeCom/Yuanbao-style aliases resolve to the installed plugin id. Thanks @Beandon13.
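The routing rule the Codex media entry describes is small: for the audio capability, the active Codex chat model id (for example "gpt-5.5") is never a valid transcription model, so the provider's declared transcription default is used instead. A minimal TypeScript sketch of that rule follows; the helper name and metadata shape are illustrative, not the actual runner code from this commit.

// Illustrative sketch only; pickCodexAudioModel is a hypothetical helper, not part of this commit.
type MediaProviderMeta = {
  capabilities: Array<"image" | "audio">;
  defaultModels: Partial<Record<"image" | "audio", string>>;
};

function pickCodexAudioModel(meta: MediaProviderMeta, activeChatModel?: string): string | undefined {
  // Chat model ids such as "gpt-5.5" are not valid transcription models, so the
  // active chat model is deliberately ignored for the audio capability.
  void activeChatModel;
  return meta.defaultModels.audio;
}

// pickCodexAudioModel(
//   { capabilities: ["image", "audio"], defaultModels: { image: "gpt-5.5", audio: "gpt-4o-transcribe" } },
//   "gpt-5.5",
// ) === "gpt-4o-transcribe"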

extensions/openai/media-understanding-provider.test.ts

Lines changed: 36 additions & 1 deletion
@@ -4,10 +4,26 @@ import {
   installPinnedHostnameTestHooks,
 } from "openclaw/plugin-sdk/test-env";
 import { describe, expect, it } from "vitest";
-import { transcribeOpenAiAudio } from "./media-understanding-provider.js";
+import {
+  openaiCodexMediaUnderstandingProvider,
+  transcribeOpenAiAudio,
+  transcribeOpenAiCodexAudio,
+} from "./media-understanding-provider.js";
 
 installPinnedHostnameTestHooks();
 
+describe("openaiCodexMediaUnderstandingProvider", () => {
+  it("declares audio support with the transcription default", () => {
+    expect(openaiCodexMediaUnderstandingProvider.capabilities).toEqual(["image", "audio"]);
+    expect(openaiCodexMediaUnderstandingProvider.defaultModels).toEqual({
+      image: "gpt-5.5",
+      audio: "gpt-4o-transcribe",
+    });
+    expect(openaiCodexMediaUnderstandingProvider.autoPriority).toEqual({ image: 20, audio: 20 });
+    expect(openaiCodexMediaUnderstandingProvider.transcribeAudio).toBe(transcribeOpenAiCodexAudio);
+  });
+});
+
 describe("transcribeOpenAiAudio", () => {
   it("respects lowercase authorization header overrides", async () => {
     const { fetchFn, getAuthHeader } = createAuthCaptureJsonFetch({ text: "ok" });
@@ -82,3 +98,22 @@ describe("transcribeOpenAiAudio", () => {
     ).rejects.toThrow("Audio transcription response missing text");
   });
 });
+
+describe("transcribeOpenAiCodexAudio", () => {
+  it("uses the OpenAI transcription default through the Codex provider id", async () => {
+    const { fetchFn, getRequest } = createRequestCaptureJsonFetch({ text: "hello" });
+
+    const result = await transcribeOpenAiCodexAudio({
+      buffer: Buffer.from("audio-bytes"),
+      fileName: "voice.wav",
+      apiKey: "test-key",
+      timeoutMs: 1234,
+      model: " ",
+      fetchFn,
+    });
+
+    const form = getRequest().init?.body as FormData;
+    expect(result.model).toBe("gpt-4o-transcribe");
+    expect(form.get("model")).toBe("gpt-4o-transcribe");
+  });
+});

extensions/openai/media-understanding-provider.ts

Lines changed: 13 additions & 3 deletions
@@ -18,6 +18,15 @@ export async function transcribeOpenAiAudio(params: AudioTranscriptionRequest) {
   });
 }
 
+export async function transcribeOpenAiCodexAudio(params: AudioTranscriptionRequest) {
+  return await transcribeOpenAiCompatibleAudio({
+    ...params,
+    provider: "openai-codex",
+    defaultBaseUrl: DEFAULT_OPENAI_AUDIO_BASE_URL,
+    defaultModel: OPENAI_DEFAULT_AUDIO_TRANSCRIPTION_MODEL,
+  });
+}
+
 export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "openai",
   capabilities: ["image", "audio"],
@@ -33,9 +42,10 @@ export const openaiMediaUnderstandingProvider: MediaUnderstandingProvider = {
 
 export const openaiCodexMediaUnderstandingProvider: MediaUnderstandingProvider = {
   id: "openai-codex",
-  capabilities: ["image"],
-  defaultModels: { image: "gpt-5.5" },
-  autoPriority: { image: 20 },
+  capabilities: ["image", "audio"],
+  defaultModels: { image: "gpt-5.5", audio: OPENAI_DEFAULT_AUDIO_TRANSCRIPTION_MODEL },
+  autoPriority: { image: 20, audio: 20 },
   describeImage: describeImageWithModel,
   describeImages: describeImagesWithModel,
+  transcribeAudio: transcribeOpenAiCodexAudio,
 };
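For context on why the blank-model test in the companion test file resolves to "gpt-4o-transcribe": transcribeOpenAiCompatibleAudio itself is not part of this diff, but the test behavior implies it trims the requested model and falls back to the defaultModel passed by transcribeOpenAiCodexAudio. A hedged sketch of that assumed fallback:

// Assumed fallback behavior of the shared transcription helper; the real
// transcribeOpenAiCompatibleAudio lives outside this diff, so this is a sketch.
function resolveTranscriptionModel(requestedModel: string | undefined, defaultModel: string): string {
  const trimmed = requestedModel?.trim();
  return trimmed ? trimmed : defaultModel;
}

// resolveTranscriptionModel(" ", "gpt-4o-transcribe") === "gpt-4o-transcribe"
// resolveTranscriptionModel("whisper-1", "gpt-4o-transcribe") === "whisper-1"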

extensions/openai/openclaw.plugin.json

Lines changed: 7 additions & 2 deletions
@@ -789,9 +789,14 @@
       }
     },
     "openai-codex": {
-      "capabilities": ["image"],
+      "capabilities": ["image", "audio"],
       "defaultModels": {
-        "image": "gpt-5.5"
+        "image": "gpt-5.5",
+        "audio": "gpt-4o-transcribe"
+      },
+      "autoPriority": {
+        "image": 20,
+        "audio": 20
       }
     }
   },

extensions/openai/openclaw.plugin.test.ts

Lines changed: 22 additions & 0 deletions
@@ -6,6 +6,14 @@ import { buildOpenAIProvider } from "./openai-provider.js";
 const manifest = JSON.parse(
   readFileSync(new URL("./openclaw.plugin.json", import.meta.url), "utf8"),
 ) as {
+  mediaUnderstandingProviderMetadata?: Record<
+    string,
+    {
+      capabilities?: string[];
+      defaultModels?: Record<string, string>;
+      autoPriority?: Record<string, number>;
+    }
+  >;
   providerAuthChoices?: Array<{
     provider?: string;
     method?: string;
@@ -72,6 +80,20 @@ describe("OpenAI plugin manifest", () => {
     expect(codexBrowserLogin?.deprecatedChoiceIds).toContain("openai-codex-import");
   });
 
+  it("keeps Codex media-understanding manifest metadata aligned with runtime audio support", () => {
+    expect(manifest.mediaUnderstandingProviderMetadata?.["openai-codex"]).toMatchObject({
+      capabilities: ["image", "audio"],
+      defaultModels: {
+        image: "gpt-5.5",
+        audio: "gpt-4o-transcribe",
+      },
+      autoPriority: {
+        image: 20,
+        audio: 20,
+      },
+    });
+  });
+
   it("labels OpenAI API key and Codex auth choices without stale mixed OAuth wording", () => {
     const choices = manifest.providerAuthChoices ?? [];
     const codexBrowserLogin = choices.find((choice) => choice.choiceId === "openai-codex");

src/media-understanding/defaults.test.ts

Lines changed: 7 additions & 3 deletions
@@ -58,9 +58,9 @@ const mediaMetadataPlugins = vi.hoisted(() => [
     autoPriority: { image: 10, audio: 10 },
   },
   "openai-codex": {
-    capabilities: ["image"],
-    defaultModels: { image: "gpt-5.5" },
-    autoPriority: { image: 20 },
+    capabilities: ["image", "audio"],
+    defaultModels: { image: "gpt-5.5", audio: "gpt-4o-transcribe" },
+    autoPriority: { image: 20, audio: 20 },
   },
   opencode: { capabilities: ["image"], defaultModels: { image: "gpt-5-nano" } },
   "opencode-go": { capabilities: ["image"], defaultModels: { image: "kimi-k2.6" } },
@@ -108,6 +108,9 @@ describe("resolveDefaultMediaModel", () => {
     expect(resolveDefaultMediaModel({ providerId: "mistral", capability: "audio" })).toBe(
       "voxtral-mini-latest",
     );
+    expect(resolveDefaultMediaModel({ providerId: "openai-codex", capability: "audio" })).toBe(
+      "gpt-4o-transcribe",
+    );
   });
 
   it("resolves bundled image defaults beyond the historical core set", () => {
@@ -136,6 +139,7 @@ describe("resolveAutoMediaKeyProviders", () => {
   it("keeps the bundled audio fallback order", () => {
     expect(resolveAutoMediaKeyProviders({ capability: "audio" })).toEqual([
       "openai",
+      "openai-codex",
       "xai",
       "google",
       "mistral",

src/media-understanding/runner.auto-audio.test.ts

Lines changed: 46 additions & 0 deletions
@@ -95,6 +95,52 @@ describe("runCapability auto audio entries", () => {
     expect(result.decision.outcome).toBe("success");
   });
 
+  it("uses the provider audio default instead of the active Codex chat model", async () => {
+    let runResult: Awaited<ReturnType<typeof runCapability>> | undefined;
+    let seenModel: string | undefined;
+
+    await withAudioFixture("openclaw-auto-audio-codex", async ({ ctx, media, cache }) => {
+      const providerRegistry = createProviderRegistry({
+        "openai-codex": {
+          id: "openai-codex",
+          capabilities: ["image", "audio"],
+          defaultModels: { image: "gpt-5.5", audio: "gpt-4o-transcribe" },
+          transcribeAudio: async (req) => {
+            seenModel = req.model;
+            return { text: "codex audio", model: req.model ?? "unknown" };
+          },
+        },
+      });
+      const cfg = {
+        models: {
+          providers: {
+            "openai-codex": {
+              apiKey: "codex-test-key", // pragma: allowlist secret
+              models: [],
+            },
+          },
+        },
+      } as unknown as OpenClawConfig;
+
+      runResult = await runCapability({
+        capability: "audio",
+        cfg,
+        ctx,
+        attachments: cache,
+        media,
+        providerRegistry,
+        activeModel: { provider: "openai-codex", model: "gpt-5.5" },
+      });
+    });
+
+    expect(runResult?.outputs[0]).toMatchObject({
+      provider: "openai-codex",
+      model: "gpt-4o-transcribe",
+      text: "codex audio",
+    });
+    expect(seenModel).toBe("gpt-4o-transcribe");
+  });
+
   it("prefers provider keys over auto-detected local whisper", async () => {
     const binDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-auto-audio-bin-"));
     try {

src/media-understanding/runner.ts

Lines changed: 18 additions & 10 deletions
@@ -749,16 +749,24 @@ async function resolveActiveModelEntry(params: {
   if (!hasAuth) {
     return null;
   }
-  const model =
-    params.capability === "image"
-      ? await resolveAutoImageModelId({
-          cfg: params.cfg,
-          providerId,
-          providerRegistry: params.providerRegistry,
-          explicitModel: params.activeModel?.model,
-        })
-      : params.activeModel?.model;
-  if (params.capability === "image" && !model) {
+  let model: string | undefined;
+  if (params.capability === "image") {
+    model = await resolveAutoImageModelId({
+      cfg: params.cfg,
+      providerId,
+      providerRegistry: params.providerRegistry,
+      explicitModel: params.activeModel?.model,
+    });
+  } else if (params.capability === "audio") {
+    model = resolveDefaultMediaModelFromRegistry({
+      providerId,
+      capability: "audio",
+      providerRegistry: params.providerRegistry,
+    });
+  } else {
+    model = params.activeModel?.model;
+  }
+  if ((params.capability === "image" || params.capability === "audio") && !model) {
     return null;
   }
   return {
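resolveDefaultMediaModelFromRegistry is called in the new audio branch but defined elsewhere; judging by the registry entries the tests construct, it presumably reads the provider's declared audio default from the registry. A sketch of that assumed lookup, with illustrative names and types:

// Assumed shape of the registry lookup used in the audio branch above; names and
// types are illustrative since the real helper is outside this diff.
type MediaCapability = "image" | "audio";

interface MediaRegistryEntry {
  defaultModels?: Partial<Record<MediaCapability, string>>;
}

function lookupDefaultMediaModel(params: {
  providerId: string;
  capability: MediaCapability;
  providerRegistry: Record<string, MediaRegistryEntry>;
}): string | undefined {
  // Returning undefined lets the guard above bail out with null instead of
  // sending a chat model id to the transcription endpoint.
  return params.providerRegistry[params.providerId]?.defaultModels?.[params.capability];
}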
