Skip to content

Commit 87e8baf

Browse files
authored
fix: prevent convo_miner from re-processing 0-chunk files on every run (#654) (#732)
* fix: register 0-chunk files to prevent re-processing on every mine (#654) mine_convos() has three early-exit paths (OSError, content too short, zero chunks) that skip writing anything to ChromaDB. Since file_already_mined() checks for the presence of a document with a matching source_file, these files are re-read and re-processed on every subsequent run. Add _register_file() that upserts a lightweight sentinel document (room="_registry", ingest_mode="registry") so file_already_mined() returns True on future runs. Note: Bug 2 from the issue (drawers_added counter always 0) was already resolved upstream via the switch from collection.add() to collection.upsert(). * fix: resolve macOS path symlink in test + remove unused variable
1 parent 9b60c6e commit 87e8baf

2 files changed

Lines changed: 81 additions & 0 deletions

File tree

mempalace/convo_miner.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,30 @@
3232
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this
3333

3434

35+
def _register_file(collection, source_file: str, wing: str, agent: str):
    """Write a sentinel so file_already_mined() returns True for 0-chunk files.

    Without this, files that normalize to nothing or produce zero chunks are
    re-read and re-processed on every mine run because nothing was written to
    ChromaDB on the first pass.
    """
    # Stable, content-independent id: derived only from the source path so a
    # repeat run upserts over the same sentinel instead of duplicating it.
    digest = hashlib.sha256(source_file.encode()).hexdigest()
    registry_meta = {
        "wing": wing,
        "room": "_registry",
        "source_file": source_file,
        "added_by": agent,
        "filed_at": datetime.now().isoformat(),
        "ingest_mode": "registry",
    }
    collection.upsert(
        documents=[f"[registry] {source_file}"],
        ids=[f"_reg_{digest[:24]}"],
        metadatas=[registry_meta],
    )
57+
58+
3559
# =============================================================================
3660
# CHUNKING — exchange pairs for conversations
3761
# =============================================================================
@@ -305,9 +329,13 @@ def mine_convos(
305329
try:
306330
content = normalize(str(filepath))
307331
except (OSError, ValueError):
332+
if not dry_run:
333+
_register_file(collection, source_file, wing, agent)
308334
continue
309335

310336
if not content or len(content.strip()) < MIN_CHUNK_SIZE:
337+
if not dry_run:
338+
_register_file(collection, source_file, wing, agent)
311339
continue
312340

313341
# Chunk — either exchange pairs or general extraction
@@ -320,6 +348,8 @@ def mine_convos(
320348
chunks = chunk_exchanges(content)
321349

322350
if not chunks:
351+
if not dry_run:
352+
_register_file(collection, source_file, wing, agent)
323353
continue
324354

325355
# Detect room from content (general mode uses memory_type instead)

tests/test_convo_miner.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
import os
22
import tempfile
33
import shutil
4+
from pathlib import Path
5+
46
import chromadb
7+
58
from mempalace.convo_miner import mine_convos
9+
from mempalace.palace import file_already_mined
610

711

812
def test_convo_mining():
@@ -24,3 +28,50 @@ def test_convo_mining():
2428
assert len(results["documents"][0]) > 0
2529

2630
shutil.rmtree(tmpdir, ignore_errors=True)
31+
32+
33+
def test_mine_convos_does_not_reprocess_short_files(capsys):
    """Files below MIN_CHUNK_SIZE get a sentinel so they are skipped on re-run."""
    workdir = tempfile.mkdtemp()
    try:
        # Create a file too short to ever produce a chunk.
        (Path(workdir) / "tiny.txt").write_text("hi")

        palace = os.path.join(workdir, "palace")

        # First pass: the file is read and the sentinel is written.
        mine_convos(workdir, palace, wing="test")
        capsys.readouterr()  # discard first-run output

        # The sentinel must be found under the resolved path
        # (macOS symlinks /var -> /private/var).
        expected_source = str(Path(workdir).resolve() / "tiny.txt")
        store = chromadb.PersistentClient(path=palace)
        drawers = store.get_collection("mempalace_drawers")
        assert file_already_mined(drawers, expected_source)

        # Second pass: the sentinel causes the file to be skipped outright.
        mine_convos(workdir, palace, wing="test")
        assert "Files skipped (already filed): 1" in capsys.readouterr().out
    finally:
        shutil.rmtree(workdir, ignore_errors=True)
59+
60+
61+
def test_mine_convos_does_not_reprocess_empty_chunk_files(capsys):
    """Files that normalize but produce 0 exchange chunks get a sentinel."""
    workdir = tempfile.mkdtemp()
    try:
        # Long enough to clear MIN_CHUNK_SIZE, but with no exchange markers
        # (no "> " lines), so chunk_exchanges returns []
        body = "This is a plain paragraph without any exchange markers. " * 5
        (Path(workdir) / "no_exchanges.txt").write_text(body)

        palace = os.path.join(workdir, "palace")

        # Run twice: the first pass writes the sentinel, the second must skip.
        mine_convos(workdir, palace, wing="test")
        mine_convos(workdir, palace, wing="test")
        combined_output = capsys.readouterr().out
        assert "Files skipped (already filed): 1" in combined_output
    finally:
        shutil.rmtree(workdir, ignore_errors=True)

0 commit comments

Comments
 (0)