Skip to content

Commit 88a53b2

Browse files
fungufrobot-rocket-science
authored and committed
fix: prevent HNSW index bloat via batch_size + sync_threshold metadata
Sets `hnsw:batch_size` and `hnsw:sync_threshold` to 50_000 at every collection-creation call site:

* `mempalace/backends/chroma.py` — `get_collection(create=True)` and the legacy `create_collection()` path. Preserves existing `hnsw:space`, `hnsw:num_threads=1` (race fix from #976), and `**ef_kwargs` (embedding-function plumbing from a4868a3).
* `mempalace/mcp_server.py` — the direct `client.get_or_create_collection` path used when a palace is first opened by the MCP server. Without this third site, MCP-bootstrapped palaces would skip the guard and could still trigger the original bloat.

Without these defaults, mining ~10K+ drawers triggers ~30 HNSW index resizes and hundreds of persistDirty() calls. persistDirty uses relative seek positioning in link_lists.bin; accumulated seek drift across resize cycles causes the OS to extend the sparse file with zero-filled regions, each cycle compounding the next. Result: link_lists.bin grows into hundreds of GB sparse, after which `status`, `search`, and `repair` all segfault and the palace is unrecoverable.

Empirical: rebuilt a palace from scratch on 39,792 drawers across 5 wings with this fix applied. Final palace 376 MB, link_lists.bin stays at 0 bytes across both Chroma collection dirs, status and search both return cleanly. Same workload without the fix bloated the palace to 565 GB sparse (30 GB on disk) and segfaulted at ~15K drawers.

Migration note: chromadb 1.5.x exposes a `collection.modify(configuration={"hnsw": {...}})` retrofit path for already-created collections (`UpdateHNSWConfiguration`), but this PR doesn't pursue it — by the time link_lists.bin has bloated the index is already corrupt and the only known recovery is a fresh mine.

Tests assert both keys land on the persisted collection metadata in both `ChromaBackend` code paths, which also covers the #1161 "config silently dropped" concern at CI time. A separate smoke test was used to verify the metadata round-trips through `chromadb.PersistentClient` reopen on chromadb 1.5.8.

Closes #344
Supersedes #346

Co-authored-by: robot-rocket-science <[email protected]>
1 parent bc5d3fa commit 88a53b2

3 files changed

Lines changed: 77 additions & 3 deletions

File tree

mempalace/backends/chroma.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,31 @@
2828
_OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"})
2929
_SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS
3030

31+
# HNSW tuning to prevent link_lists.bin bloat on large mines (#344).
32+
#
33+
# With default params (batch_size=100, sync_threshold=1000, initial capacity
34+
# 1000), inserting tens of thousands of drawers triggers ~30 index resizes
35+
# and hundreds of persistDirty() calls. persistDirty uses relative seek
36+
# positioning in link_lists.bin; accumulated seek drift across resize cycles
37+
# causes the OS to extend the sparse file with zero-filled regions, each
38+
# cycle compounding the next. Result: link_lists.bin grows into hundreds of
39+
# GB sparse, after which `status`/`search`/`repair` segfault.
40+
#
41+
# Setting large batch and sync thresholds at collection creation defers
42+
# persistence until a single large batch completes, breaking the resize+
43+
# persist feedback loop. Empirically validated on a 39,792-drawer rebuild
44+
# (palace 376 MB, link_lists.bin 0 bytes, no segfault) in 2026-04.
45+
#
46+
# Note: chromadb 1.5.x exposes a `collection.modify(configuration={"hnsw":
47+
# {"batch_size": ..., "sync_threshold": ...}})` retrofit path for already-
48+
# created collections (`UpdateHNSWConfiguration` in chromadb's API), but
49+
# this PR doesn't pursue that — once link_lists.bin has bloated, the index
50+
# is already corrupt and the only known recovery is a fresh mine.
51+
_HNSW_BLOAT_GUARD = {
52+
"hnsw:batch_size": 50_000,
53+
"hnsw:sync_threshold": 50_000,
54+
}
55+
3156

3257
def _validate_where(where: Optional[dict]) -> None:
3358
"""Scan a where-clause for unknown operators and raise ``UnsupportedFilterError``.
@@ -992,7 +1017,11 @@ def get_collection(
9921017
if create:
9931018
collection = client.get_or_create_collection(
9941019
collection_name,
995-
metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1},
1020+
metadata={
1021+
"hnsw:space": hnsw_space,
1022+
"hnsw:num_threads": 1,
1023+
**_HNSW_BLOAT_GUARD,
1024+
},
9961025
**ef_kwargs,
9971026
)
9981027
else:
@@ -1042,7 +1071,11 @@ def create_collection(
10421071
ef_kwargs = {"embedding_function": ef} if ef is not None else {}
10431072
collection = self._client(palace_path).create_collection(
10441073
collection_name,
1045-
metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1},
1074+
metadata={
1075+
"hnsw:space": hnsw_space,
1076+
"hnsw:num_threads": 1,
1077+
**_HNSW_BLOAT_GUARD,
1078+
},
10461079
**ef_kwargs,
10471080
)
10481081
return ChromaCollection(collection)

mempalace/mcp_server.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
from .backends.chroma import ( # noqa: E402
6161
ChromaBackend,
6262
ChromaCollection,
63+
_HNSW_BLOAT_GUARD,
6364
_pin_hnsw_threads,
6465
hnsw_capacity_status,
6566
)
@@ -285,7 +286,11 @@ def _get_collection(create=False):
285286
# so the retrofit runs every time _get_collection opens a cache).
286287
raw = client.get_or_create_collection(
287288
_config.collection_name,
288-
metadata={"hnsw:space": "cosine", "hnsw:num_threads": 1},
289+
metadata={
290+
"hnsw:space": "cosine",
291+
"hnsw:num_threads": 1,
292+
**_HNSW_BLOAT_GUARD,
293+
},
289294
)
290295
_pin_hnsw_threads(raw)
291296
_collection_cache = ChromaCollection(raw)

tests/test_backends.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,42 @@ def test_chroma_backend_creates_collection_with_cosine_distance(tmp_path):
336336
assert col.metadata.get("hnsw:space") == "cosine"
337337

338338

339+
def test_chroma_backend_sets_hnsw_bloat_guard_on_creation(tmp_path):
340+
"""The HNSW guard from #344 must land on freshly-created collection metadata.
341+
342+
Without batch_size + sync_threshold, mining ~10K+ drawers triggers the
343+
resize+persist drift that bloats link_lists.bin into hundreds of GB sparse
344+
and segfaults `status` / `search` / `repair`. The guard belongs at
345+
collection-creation time so every fresh palace gets it without needing
346+
a runtime retrofit. Asserting both keys land on the persisted metadata
347+
also covers the #1161 "config silently dropped" concern at CI time.
348+
"""
349+
palace_path = tmp_path / "palace"
350+
351+
ChromaBackend().get_collection(
352+
str(palace_path),
353+
collection_name="mempalace_drawers",
354+
create=True,
355+
)
356+
357+
client = chromadb.PersistentClient(path=str(palace_path))
358+
col = client.get_collection("mempalace_drawers")
359+
assert col.metadata.get("hnsw:batch_size") == 50_000
360+
assert col.metadata.get("hnsw:sync_threshold") == 50_000
361+
362+
363+
def test_chroma_backend_create_collection_sets_hnsw_bloat_guard(tmp_path):
364+
"""Same guard must apply via the legacy create_collection() path."""
365+
palace_path = tmp_path / "palace"
366+
367+
ChromaBackend().create_collection(str(palace_path), "mempalace_drawers")
368+
369+
client = chromadb.PersistentClient(path=str(palace_path))
370+
col = client.get_collection("mempalace_drawers")
371+
assert col.metadata.get("hnsw:batch_size") == 50_000
372+
assert col.metadata.get("hnsw:sync_threshold") == 50_000
373+
374+
339375
def test_fix_blob_seq_ids_converts_blobs_to_integers(tmp_path):
340376
"""Simulate a ChromaDB 0.6.x database with BLOB seq_ids and verify repair."""
341377
db_path = tmp_path / "chroma.sqlite3"

0 commit comments

Comments
 (0)