Skip to content

Commit 2bbe896

Browse files
SubtleSpark and claude committed
fix(memory): serialize local embedding initialization to avoid duplicate model loads
Concurrent calls to ensureContext() during file-level parallel indexing (EMBEDDING_INDEX_CONCURRENCY=4) could each pass the `if (!llama)` check before the first await resolved, causing the model to be loaded multiple times into VRAM. This exhausted GPU memory and made local embeddings unusable for users with 2+ memory files. Guard initialization with a cached Promise so all concurrent callers share a single init sequence. Co-Authored-By: Claude Opus 4.6 <[email protected]>
1 parent 417509c commit 2bbe896

File tree

2 files changed

+92
-10
lines changed

2 files changed

+92
-10
lines changed

src/memory/embeddings.test.ts

Lines changed: 72 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -480,3 +480,75 @@ describe("local embedding normalization", () => {
480480
}
481481
});
482482
});
483+
484+
describe("local embedding ensureContext concurrency", () => {
485+
afterEach(() => {
486+
vi.resetAllMocks();
487+
vi.resetModules();
488+
vi.unstubAllGlobals();
489+
vi.doUnmock("./node-llama.js");
490+
});
491+
492+
it("loads the model only once when embedBatch is called concurrently", async () => {
493+
const getLlamaSpy = vi.fn();
494+
const loadModelSpy = vi.fn();
495+
const createContextSpy = vi.fn();
496+
497+
vi.doMock("./node-llama.js", () => ({
498+
importNodeLlamaCpp: async () => ({
499+
getLlama: async (...args: unknown[]) => {
500+
getLlamaSpy(...args);
501+
// Simulate real async delay so concurrent callers can interleave
502+
await new Promise((r) => setTimeout(r, 50));
503+
return {
504+
loadModel: async (...modelArgs: unknown[]) => {
505+
loadModelSpy(...modelArgs);
506+
await new Promise((r) => setTimeout(r, 50));
507+
return {
508+
createEmbeddingContext: async () => {
509+
createContextSpy();
510+
return {
511+
getEmbeddingFor: vi.fn().mockResolvedValue({
512+
vector: new Float32Array([1, 0, 0, 0]),
513+
}),
514+
};
515+
},
516+
};
517+
},
518+
};
519+
},
520+
resolveModelFile: async () => "/fake/model.gguf",
521+
LlamaLogLevel: { error: 0 },
522+
}),
523+
}));
524+
525+
const { createEmbeddingProvider } = await import("./embeddings.js");
526+
527+
const result = await createEmbeddingProvider({
528+
config: {} as never,
529+
provider: "local",
530+
model: "",
531+
fallback: "none",
532+
});
533+
534+
// Launch 4 concurrent embedBatch calls (simulates EMBEDDING_INDEX_CONCURRENCY = 4)
535+
const results = await Promise.all([
536+
result.provider.embedBatch(["text1"]),
537+
result.provider.embedBatch(["text2"]),
538+
result.provider.embedBatch(["text3"]),
539+
result.provider.embedBatch(["text4"]),
540+
]);
541+
542+
// All calls should return valid embeddings
543+
expect(results).toHaveLength(4);
544+
for (const embeddings of results) {
545+
expect(embeddings).toHaveLength(1);
546+
expect(embeddings[0]).toHaveLength(4);
547+
}
548+
549+
// The model should only be loaded once despite 4 concurrent calls
550+
expect(getLlamaSpy).toHaveBeenCalledTimes(1);
551+
expect(loadModelSpy).toHaveBeenCalledTimes(1);
552+
expect(createContextSpy).toHaveBeenCalledTimes(1);
553+
});
554+
});

src/memory/embeddings.ts

Lines changed: 20 additions & 10 deletions
Original file line number · Diff line number · Diff line change
@@ -91,19 +91,29 @@ async function createLocalEmbeddingProvider(
9191
let llama: Llama | null = null;
9292
let embeddingModel: LlamaModel | null = null;
9393
let embeddingContext: LlamaEmbeddingContext | null = null;
94+
let initPromise: Promise<LlamaEmbeddingContext> | null = null;
9495

95-
const ensureContext = async () => {
96-
if (!llama) {
97-
llama = await getLlama({ logLevel: LlamaLogLevel.error });
96+
const ensureContext = async (): Promise<LlamaEmbeddingContext> => {
97+
if (embeddingContext) {
98+
return embeddingContext;
9899
}
99-
if (!embeddingModel) {
100-
const resolved = await resolveModelFile(modelPath, modelCacheDir || undefined);
101-
embeddingModel = await llama.loadModel({ modelPath: resolved });
100+
if (initPromise) {
101+
return initPromise;
102102
}
103-
if (!embeddingContext) {
104-
embeddingContext = await embeddingModel.createEmbeddingContext();
105-
}
106-
return embeddingContext;
103+
initPromise = (async () => {
104+
if (!llama) {
105+
llama = await getLlama({ logLevel: LlamaLogLevel.error });
106+
}
107+
if (!embeddingModel) {
108+
const resolved = await resolveModelFile(modelPath, modelCacheDir || undefined);
109+
embeddingModel = await llama.loadModel({ modelPath: resolved });
110+
}
111+
if (!embeddingContext) {
112+
embeddingContext = await embeddingModel.createEmbeddingContext();
113+
}
114+
return embeddingContext;
115+
})();
116+
return initPromise;
107117
};
108118

109119
return {

0 commit comments

Comments (0)