Skip to content
/ core Public

Commit 8c2afef

Browse files
committed
feat(ai): implement Vercel AI Gateway prompt caching in OpenAICompatibleRuntime
- Added baseURL detection for Vercel AI Gateway to enable automatic caching for chat completion requests. - Introduced helper methods to decorate request parameters with `providerOptions.gateway.caching = 'auto'` for compatible endpoints. - Updated `generateText`, `generateStructured`, and `generateTextStream` methods to utilize the new caching logic. - Created unit tests to verify caching behavior for both gateway and non-gateway endpoints. Signed-off-by: Innei <[email protected]>
1 parent 15e9b29 commit 8c2afef

File tree

4 files changed

+353
-17
lines changed

4 files changed

+353
-17
lines changed

apps/core/src/modules/ai/runtime/openai-compatible.runtime.ts

Lines changed: 55 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import type {
2121
export class OpenAICompatibleRuntime extends BaseRuntime {
2222
readonly providerInfo: RuntimeProviderInfo
2323
private readonly client: OpenAI
24+
private readonly baseURL: string
2425

2526
constructor(config: RuntimeConfig) {
2627
super()
@@ -31,6 +32,7 @@ export class OpenAICompatibleRuntime extends BaseRuntime {
3132
}
3233

3334
const baseURL = this.resolveBaseURL(config)
35+
this.baseURL = baseURL
3436
this.client = new OpenAI({
3537
apiKey: config.apiKey,
3638
baseURL,
@@ -62,6 +64,38 @@ export class OpenAICompatibleRuntime extends BaseRuntime {
6264
return normalized
6365
}
6466

67+
private isVercelAiGateway(): boolean {
68+
try {
69+
return new URL(this.baseURL).hostname === 'ai-gateway.vercel.sh'
70+
} catch {
71+
return false
72+
}
73+
}
74+
75+
private withGatewayPromptCache<T extends Record<string, unknown>>(
76+
params: T,
77+
): T {
78+
if (!this.isVercelAiGateway()) {
79+
return params
80+
}
81+
82+
const providerOptions = (
83+
params as { providerOptions?: Record<string, any> }
84+
).providerOptions
85+
const gatewayOptions = providerOptions?.gateway
86+
87+
return {
88+
...params,
89+
providerOptions: {
90+
...providerOptions,
91+
gateway: {
92+
...gatewayOptions,
93+
caching: 'auto',
94+
},
95+
},
96+
}
97+
}
98+
6599
async generateText(
66100
options: GenerateTextOptions,
67101
): Promise<GenerateTextResult> {
@@ -84,13 +118,15 @@ export class OpenAICompatibleRuntime extends BaseRuntime {
84118
: undefined
85119

86120
return this.withRetry(async () => {
87-
const response = await this.client.chat.completions.create({
88-
model: this.providerInfo.model,
89-
messages: chatMessages,
90-
temperature,
91-
max_tokens: maxTokens,
92-
reasoning_effort: openaiReasoningEffort,
93-
} as OpenAI.ChatCompletionCreateParamsNonStreaming)
121+
const response = await this.client.chat.completions.create(
122+
this.withGatewayPromptCache({
123+
model: this.providerInfo.model,
124+
messages: chatMessages,
125+
temperature,
126+
max_tokens: maxTokens,
127+
reasoning_effort: openaiReasoningEffort,
128+
}) as OpenAI.ChatCompletionCreateParamsNonStreaming,
129+
)
94130

95131
const choice = response.choices[0]
96132
return {
@@ -163,14 +199,16 @@ export class OpenAICompatibleRuntime extends BaseRuntime {
163199
}
164200

165201
for (let i = 0; i < maxIterations; i++) {
166-
const response = await this.client.chat.completions.create({
167-
model: this.providerInfo.model,
168-
messages: conversationMessages,
169-
temperature,
170-
max_tokens: maxTokens,
171-
reasoning_effort: openaiReasoningEffort,
172-
...toolConfig,
173-
} as OpenAI.ChatCompletionCreateParamsNonStreaming)
202+
const response = await this.client.chat.completions.create(
203+
this.withGatewayPromptCache({
204+
model: this.providerInfo.model,
205+
messages: conversationMessages,
206+
temperature,
207+
max_tokens: maxTokens,
208+
reasoning_effort: openaiReasoningEffort,
209+
...toolConfig,
210+
}) as OpenAI.ChatCompletionCreateParamsNonStreaming,
211+
)
174212

175213
if (response.usage) {
176214
totalUsage.promptTokens += response.usage.prompt_tokens
@@ -231,14 +269,14 @@ export class OpenAICompatibleRuntime extends BaseRuntime {
231269
: undefined
232270

233271
const response = await this.client.chat.completions.create(
234-
{
272+
this.withGatewayPromptCache({
235273
model: this.providerInfo.model,
236274
messages: chatMessages,
237275
temperature,
238276
max_tokens: maxTokens,
239277
stream: true,
240278
reasoning_effort: openaiReasoningEffort,
241-
} as OpenAI.ChatCompletionCreateParams & { stream: true },
279+
}) as OpenAI.ChatCompletionCreateParams & { stream: true },
242280
{ signal },
243281
)
244282

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
import { beforeEach, describe, expect, it, vi } from 'vitest'
2+
import { z } from 'zod'
3+
4+
import { AIProviderType } from '../../../../src/modules/ai/ai.types'
5+
import { OpenAICompatibleRuntime } from '../../../../src/modules/ai/runtime/openai-compatible.runtime'
6+
7+
const { createMock } = vi.hoisted(() => ({
8+
createMock: vi.fn(),
9+
}))
10+
11+
vi.mock('openai', () => ({
12+
default: class MockOpenAI {
13+
chat = {
14+
completions: {
15+
create: createMock,
16+
},
17+
}
18+
19+
constructor(_: unknown) {}
20+
},
21+
}))
22+
23+
describe('OpenAICompatibleRuntime prompt caching', () => {
24+
beforeEach(() => {
25+
createMock.mockReset()
26+
})
27+
28+
it('adds Vercel gateway automatic caching to text requests', async () => {
29+
createMock.mockResolvedValueOnce({
30+
choices: [{ message: { content: 'cached' } }],
31+
usage: {
32+
prompt_tokens: 10,
33+
completion_tokens: 2,
34+
total_tokens: 12,
35+
},
36+
})
37+
38+
const runtime = new OpenAICompatibleRuntime({
39+
apiKey: 'test-key',
40+
endpoint: 'https://ai-gateway.vercel.sh',
41+
model: 'anthropic/claude-sonnet-4.6',
42+
providerType: AIProviderType.OpenAICompatible,
43+
providerId: 'vercel-gateway',
44+
})
45+
46+
await runtime.generateText({ prompt: 'hello' })
47+
48+
expect(createMock).toHaveBeenCalledTimes(1)
49+
expect(createMock.mock.calls[0]?.[0]).toMatchObject({
50+
providerOptions: {
51+
gateway: {
52+
caching: 'auto',
53+
},
54+
},
55+
})
56+
})
57+
58+
it('does not add gateway caching to non-gateway compatible endpoints', async () => {
59+
createMock.mockResolvedValueOnce({
60+
choices: [{ message: { content: 'plain' } }],
61+
usage: {
62+
prompt_tokens: 8,
63+
completion_tokens: 1,
64+
total_tokens: 9,
65+
},
66+
})
67+
68+
const runtime = new OpenAICompatibleRuntime({
69+
apiKey: 'test-key',
70+
endpoint: 'https://api.deepseek.com',
71+
model: 'deepseek-chat',
72+
providerType: AIProviderType.OpenAICompatible,
73+
providerId: 'deepseek',
74+
})
75+
76+
await runtime.generateText({ prompt: 'hello' })
77+
78+
expect(createMock).toHaveBeenCalledTimes(1)
79+
expect(createMock.mock.calls[0]?.[0]).not.toHaveProperty('providerOptions')
80+
})
81+
82+
it('does not add gateway caching to default OpenRouter requests', async () => {
83+
createMock.mockResolvedValueOnce({
84+
choices: [{ message: { content: 'plain' } }],
85+
usage: {
86+
prompt_tokens: 8,
87+
completion_tokens: 1,
88+
total_tokens: 9,
89+
},
90+
})
91+
92+
const runtime = new OpenAICompatibleRuntime({
93+
apiKey: 'test-key',
94+
model: 'openai/gpt-4o-mini',
95+
providerType: AIProviderType.OpenRouter,
96+
providerId: 'openrouter',
97+
})
98+
99+
await runtime.generateText({ prompt: 'hello' })
100+
101+
expect(createMock).toHaveBeenCalledTimes(1)
102+
expect(createMock.mock.calls[0]?.[0]).not.toHaveProperty('providerOptions')
103+
})
104+
105+
it('adds Vercel gateway automatic caching to structured requests', async () => {
106+
createMock.mockResolvedValueOnce({
107+
choices: [
108+
{
109+
message: {
110+
tool_calls: [
111+
{
112+
type: 'function',
113+
function: {
114+
name: 'structured_output',
115+
arguments: JSON.stringify({ answer: 'cached' }),
116+
},
117+
},
118+
],
119+
},
120+
},
121+
],
122+
usage: {
123+
prompt_tokens: 10,
124+
completion_tokens: 2,
125+
total_tokens: 12,
126+
},
127+
})
128+
129+
const runtime = new OpenAICompatibleRuntime({
130+
apiKey: 'test-key',
131+
endpoint: 'https://ai-gateway.vercel.sh/v1',
132+
model: 'anthropic/claude-sonnet-4.6',
133+
providerType: AIProviderType.OpenAICompatible,
134+
providerId: 'vercel-gateway',
135+
})
136+
137+
await runtime.generateStructured({
138+
prompt: 'hello',
139+
schema: z.object({ answer: z.string() }),
140+
})
141+
142+
expect(createMock.mock.calls[0]?.[0]).toMatchObject({
143+
providerOptions: {
144+
gateway: {
145+
caching: 'auto',
146+
},
147+
},
148+
})
149+
})
150+
151+
it('adds Vercel gateway automatic caching to streaming requests', async () => {
152+
createMock.mockResolvedValueOnce(
153+
(async function* () {
154+
yield {
155+
choices: [{ delta: { content: 'stream' } }],
156+
}
157+
})(),
158+
)
159+
160+
const runtime = new OpenAICompatibleRuntime({
161+
apiKey: 'test-key',
162+
endpoint: 'https://ai-gateway.vercel.sh',
163+
model: 'anthropic/claude-sonnet-4.6',
164+
providerType: AIProviderType.OpenAICompatible,
165+
providerId: 'vercel-gateway',
166+
})
167+
168+
const chunks: string[] = []
169+
for await (const chunk of runtime.generateTextStream({ prompt: 'hello' })) {
170+
chunks.push(chunk.text)
171+
}
172+
173+
expect(chunks).toEqual(['stream'])
174+
expect(createMock.mock.calls[0]?.[0]).toMatchObject({
175+
stream: true,
176+
providerOptions: {
177+
gateway: {
178+
caching: 'auto',
179+
},
180+
},
181+
})
182+
})
183+
})
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# Vercel AI Gateway Prompt Cache Implementation Plan
2+
3+
> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking.
4+
5+
**Goal:** Enable Vercel AI Gateway automatic prompt caching in the OpenAI-compatible runtime without affecting non-gateway OpenAI-compatible providers.
6+
7+
**Architecture:** Detect whether the runtime resolves to a Vercel AI Gateway base URL, then decorate chat completion requests with `providerOptions.gateway.caching = 'auto'`. Reuse the same helper across text, structured, and streaming calls so behavior stays consistent.
8+
9+
**Tech Stack:** TypeScript, NestJS runtime layer, OpenAI SDK, Vitest, pnpm
10+
11+
---
12+
13+
## File Map
14+
15+
| Action | File | Responsibility |
16+
|--------|------|---------------|
17+
| Modify | `apps/core/src/modules/ai/runtime/openai-compatible.runtime.ts` | Detect Vercel AI Gateway and decorate request params |
18+
| Create | `apps/core/test/src/modules/ai/openai-compatible.runtime.spec.ts` | Verify gateway-only caching injection |
19+
20+
---
21+
22+
## Chunk 1: Runtime Contract
23+
24+
### Task 1: Lock gateway-only caching behavior with tests
25+
26+
**Files:**
27+
- Create: `apps/core/test/src/modules/ai/openai-compatible.runtime.spec.ts`
28+
29+
- [x] **Step 1: Write the failing test**
30+
31+
Cover:
32+
- Vercel AI Gateway endpoint adds `providerOptions.gateway.caching = 'auto'`
33+
- Non-gateway OpenAI-compatible endpoint does not add `providerOptions`
34+
35+
- [x] **Step 2: Run the targeted test to verify RED**
36+
37+
Run: `pnpm test -- apps/core/test/src/modules/ai/openai-compatible.runtime.spec.ts`
38+
39+
Expected: The gateway-specific assertion fails because the runtime does not yet decorate requests.
40+
41+
### Task 2: Implement gateway request decoration
42+
43+
**Files:**
44+
- Modify: `apps/core/src/modules/ai/runtime/openai-compatible.runtime.ts`
45+
46+
- [x] **Step 1: Add a helper that detects Vercel AI Gateway**
47+
48+
Use the resolved `baseURL` so detection covers normalized endpoints.
49+
50+
- [x] **Step 2: Add a helper that decorates chat completion params**
51+
52+
Inject:
53+
54+
```ts
55+
providerOptions: {
56+
gateway: {
57+
caching: 'auto',
58+
},
59+
}
60+
```
61+
62+
only when the base URL points to Vercel AI Gateway.
63+
64+
- [x] **Step 3: Reuse the helper in all request paths**
65+
66+
Update:
67+
- `generateText`
68+
- `generateStructured`
69+
- `generateTextStream`
70+
71+
- [x] **Step 4: Run the targeted test to verify GREEN**
72+
73+
Run: `pnpm test -- apps/core/test/src/modules/ai/openai-compatible.runtime.spec.ts`
74+
75+
Expected: Both tests pass.
76+
77+
- [x] **Step 5: Run a small related test slice**
78+
79+
Run: `pnpm test -- apps/core/test/src/modules/ai/ai-provider.factory.spec.ts apps/core/test/src/modules/ai/openai-compatible.runtime.spec.ts`
80+
81+
Expected: Existing runtime factory coverage still passes.

0 commit comments

Comments
 (0)