Commit fe74a7d
feat(convo_miner): parallel file processing + batched upserts
Port PR MemPalace#416's parallelization pattern from miner.py to convo_miner.py.

PR MemPalace#416 only batched project mining (mempalace mine <dir>), leaving
conversation mining (mempalace mine <dir> --mode convos) on the slow per-chunk
collection.add() path. Since convo mining is the primary ingest path for large
corpora like ~/.claude/projects/ (~1.5 GB of Claude Code JSONL), this omission
negated most of the speedup potential from both PR MemPalace#416 (batching) and
PR MemPalace#442 (GPU embedding).

Changes:

1. Extract process_convo_file_cpu() — a pure-CPU worker that normalizes,
   chunks, detects the room, and builds drawer records. Thread-safe by
   construction: no ChromaDB calls, no shared state, all inputs passed
   explicitly. Returns (source_file, room, records, room_counts_delta), or
   None for skipped files.

2. Rewrite the mine_convos() ingest loop:
   - Pre-filter pending files with file_already_mined() (sequential, matching
     PR MemPalace#416's pattern).
   - Submit all pending files to a ThreadPoolExecutor with
     MAX_WORKERS = min(32, cpu_count() * 2) for parallel normalize/chunk work.
   - The main thread accumulates records into batch_ids/docs/metas lists and
     flushes via collection.upsert() every BATCH_SIZE (128) records.
   - try/finally around the executor guarantees the final flush_batch() runs
     even if a worker raises, preventing silent loss of up to BATCH_SIZE - 1
     pending drawers.
   - Per-worker exceptions are caught and logged instead of aborting the whole
     run (each file is independent).

3. Keep the dry_run path sequential — matches miner.py, preserves the original
   output formatting (per-file [DRY RUN] lines, room distribution), and uses
   the same extracted worker for consistency.

4. Switch collection.add() -> collection.upsert() — idempotent, removes the
   try/except "already exists" dance, matches miner.py.

Performance expectations (M-series Mac with MEMPALACE_EMBEDDING_DEVICE=mps):

- Before (single-file sequential loop + per-chunk .add()):
  502 drawers in 12.3 s = 40.7 drawers/s.
- After (parallel reads + batched upserts): expected ~3-5x improvement from
  batching alone (the GPU finally gets meaningful batch sizes), plus another
  ~2-3x from parallelizing the normalize() step on large JSONL files.
  Combined: ~5-15x.

All 556 tests still pass, including tests/test_convo_miner.py, which exercises
the real ChromaDB write path end to end. No changes to the public API;
callers (cli.py, mcp_server.py) are unaffected.
1 parent 66c2825 commit fe74a7d
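For orientation, here is the ported pattern reduced to its shape: worker threads do the CPU-bound per-file work, while the main thread remains the sole writer and flushes in fixed-size batches. This is a minimal sketch only; process_file and store are illustrative stand-ins for process_convo_file_cpu() and the ChromaDB collection, not the real MemPalace API.

    # Sketch of the parallel-read + batched-write pattern (illustrative names:
    # process_file and store stand in for the real worker and ChromaDB collection).
    import os
    from concurrent.futures import ThreadPoolExecutor, as_completed

    MAX_WORKERS = min(32, (os.cpu_count() or 4) * 2)
    BATCH_SIZE = 128

    def mine(paths, process_file, store):
        batch = []

        def flush():
            if batch:
                store.upsert(batch)  # one write per <= BATCH_SIZE records
                batch.clear()

        try:
            with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
                futures = {pool.submit(process_file, p): p for p in paths}
                for fut in as_completed(futures):
                    try:
                        records = fut.result()
                    except Exception as e:  # one bad file never aborts the run
                        print(f"! {futures[fut]}: {e}")
                        continue
                    for rec in records or []:  # None means "skip this file"
                        batch.append(rec)
                        if len(batch) >= BATCH_SIZE:
                            flush()
        finally:
            flush()  # drain the tail batch even if a worker raised

For scale: at the reported baseline of 40.7 drawers/s, the projected ~5-15x combined speedup works out to roughly 200-600 drawers/s.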

1 file changed: mempalace/convo_miner.py (161 additions, 80 deletions)
@@ -14,6 +14,7 @@
 from pathlib import Path
 from datetime import datetime
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from .normalize import normalize
 from .palace import SKIP_DIRS, get_collection, file_already_mined
@@ -28,6 +29,8 @@
 }
 
 MIN_CHUNK_SIZE = 30
+BATCH_SIZE = 128  # chunks per upsert call (matches miner.py)
+MAX_WORKERS = min(32, (os.cpu_count() or 4) * 2)  # parallel file readers
 MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB — skip files larger than this
 
 
@@ -229,6 +232,87 @@ def scan_convos(convo_dir: str) -> list:
 # =============================================================================
 
 
+def process_convo_file_cpu(
+    filepath: Path,
+    wing: str,
+    agent: str,
+    extract_mode: str,
+) -> tuple:
+    """
+    Pure CPU worker: normalize, chunk, detect room, build drawer records.
+    Thread-safe — no ChromaDB calls, no shared state.
+
+    Returns (source_file, room, records, room_counts_delta) or None if the file
+    should be skipped (empty, too small, failed to normalize).
+
+    - source_file: absolute path string
+    - room: detected room name (or None for extract_mode='general')
+    - records: list of (drawer_id, content, metadata) tuples ready for upsert
+    - room_counts_delta: dict of {room: count} contributed by this file
+      (for general mode this holds per-chunk memory_type counts; for exchange
+      mode it holds {room: 1})
+    """
+    source_file = str(filepath)
+
+    # Normalize format (may raise — caller catches)
+    try:
+        content = normalize(source_file)
+    except (OSError, ValueError):
+        return None
+
+    if not content or len(content.strip()) < MIN_CHUNK_SIZE:
+        return None
+
+    # Chunk — either exchange pairs or general extraction
+    if extract_mode == "general":
+        from .general_extractor import extract_memories
+
+        chunks = extract_memories(content)
+    else:
+        chunks = chunk_exchanges(content)
+
+    if not chunks:
+        return None
+
+    # Detect room from content (general mode uses memory_type instead)
+    if extract_mode != "general":
+        room = detect_convo_room(content)
+    else:
+        room = None  # set per-chunk below
+
+    now = datetime.now().isoformat()
+    records = []
+    room_counts_delta = defaultdict(int)
+
+    if extract_mode != "general":
+        room_counts_delta[room] = 1
+
+    for chunk in chunks:
+        chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
+        if extract_mode == "general":
+            room_counts_delta[chunk_room] += 1
+
+        drawer_id = (
+            f"drawer_{wing}_{chunk_room}_"
+            + hashlib.sha256(
+                (source_file + str(chunk["chunk_index"])).encode()
+            ).hexdigest()[:24]
+        )
+        meta = {
+            "wing": wing,
+            "room": chunk_room,
+            "source_file": source_file,
+            "chunk_index": chunk["chunk_index"],
+            "added_by": agent,
+            "filed_at": now,
+            "ingest_mode": "convos",
+            "extract_mode": extract_mode,
+        }
+        records.append((drawer_id, chunk["content"], meta))
+
+    return source_file, room, records, dict(room_counts_delta)
+
+
 def mine_convos(
     convo_dir: str,
     palace_path: str,
@@ -270,93 +354,90 @@ def mine_convos(
     files_skipped = 0
     room_counts = defaultdict(int)
 
-    for i, filepath in enumerate(files, 1):
-        source_file = str(filepath)
-
-        # Skip if already filed
-        if not dry_run and file_already_mined(collection, source_file):
-            files_skipped += 1
-            continue
-
-        # Normalize format
-        try:
-            content = normalize(str(filepath))
-        except (OSError, ValueError):
-            continue
-
-        if not content or len(content.strip()) < MIN_CHUNK_SIZE:
-            continue
-
-        # Chunk — either exchange pairs or general extraction
-        if extract_mode == "general":
-            from .general_extractor import extract_memories
-
-            chunks = extract_memories(content)
-            # Each chunk already has memory_type; use it as the room name
-        else:
-            chunks = chunk_exchanges(content)
-
-        if not chunks:
-            continue
-
-        # Detect room from content (general mode uses memory_type instead)
-        if extract_mode != "general":
-            room = detect_convo_room(content)
-        else:
-            room = None  # set per-chunk below
-
-        if dry_run:
+    # --------------------------------------------------------------
+    # DRY RUN: sequential, no writes, preserves original output
+    # --------------------------------------------------------------
+    if dry_run:
+        for i, filepath in enumerate(files, 1):
+            result = process_convo_file_cpu(filepath, wing, agent, extract_mode)
+            if result is None:
+                continue
+            _, room, records, room_counts_delta = result
             if extract_mode == "general":
                 from collections import Counter
 
-                type_counts = Counter(c.get("memory_type", "general") for c in chunks)
+                type_counts = Counter(meta["room"] for (_, _, meta) in records)
                 types_str = ", ".join(f"{t}:{n}" for t, n in type_counts.most_common())
-                print(f" [DRY RUN] {filepath.name} → {len(chunks)} memories ({types_str})")
+                print(f" [DRY RUN] {filepath.name} → {len(records)} memories ({types_str})")
             else:
-                print(f" [DRY RUN] {filepath.name} → room:{room} ({len(chunks)} drawers)")
-            total_drawers += len(chunks)
-            # Track room counts
-            if extract_mode == "general":
-                for c in chunks:
-                    room_counts[c.get("memory_type", "general")] += 1
-            else:
-                room_counts[room] += 1
-            continue
-
-        if extract_mode != "general":
-            room_counts[room] += 1
-
-        # File each chunk
-        drawers_added = 0
-        for chunk in chunks:
-            chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
-            if extract_mode == "general":
-                room_counts[chunk_room] += 1
-            drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
-            try:
-                collection.add(
-                    documents=[chunk["content"]],
-                    ids=[drawer_id],
-                    metadatas=[
-                        {
-                            "wing": wing,
-                            "room": chunk_room,
-                            "source_file": source_file,
-                            "chunk_index": chunk["chunk_index"],
-                            "added_by": agent,
-                            "filed_at": datetime.now().isoformat(),
-                            "ingest_mode": "convos",
-                            "extract_mode": extract_mode,
-                        }
-                    ],
+                print(f" [DRY RUN] {filepath.name} → room:{room} ({len(records)} drawers)")
+            total_drawers += len(records)
+            for r, c in room_counts_delta.items():
+                room_counts[r] += c
+
+    # --------------------------------------------------------------
+    # REAL MINE: parallel file processing + batched upserts
+    # --------------------------------------------------------------
+    else:
+        print(f" Checking {len(files)} files for changes...")
+        pending = [fp for fp in files if not file_already_mined(collection, str(fp))]
+        already_mined = len(files) - len(pending)
+
+        batch_ids, batch_docs, batch_metas = [], [], []
+        completed = 0
+        skipped_small = 0
+
+        def flush_batch():
+            if batch_ids:
+                collection.upsert(
+                    documents=batch_docs,
+                    ids=batch_ids,
+                    metadatas=batch_metas,
                 )
-                drawers_added += 1
-            except Exception as e:
-                if "already exists" not in str(e).lower():
-                    raise
+                batch_ids.clear()
+                batch_docs.clear()
+                batch_metas.clear()
 
-        total_drawers += drawers_added
-        print(f" ✓ [{i:4}/{len(files)}] {filepath.name[:50]:50} +{drawers_added}")
+        # try/finally guarantees flush_batch() runs even if a worker raises,
+        # preventing silent loss of up to BATCH_SIZE-1 pending drawers.
+        try:
+            with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+                futures = {
+                    executor.submit(
+                        process_convo_file_cpu, fp, wing, agent, extract_mode
+                    ): fp
+                    for fp in pending
+                }
+                for future in as_completed(futures):
+                    filepath = futures[future]
+                    try:
+                        result = future.result()
+                    except Exception as e:
+                        print(f" ! [ERROR] {filepath.name}: {e}")
+                        completed += 1
+                        continue
+                    completed += 1
+                    if result is None:
+                        skipped_small += 1
+                        continue
+                    source_file, room, records, room_counts_delta = result
+                    for drawer_id, chunk_content, meta in records:
+                        batch_ids.append(drawer_id)
+                        batch_docs.append(chunk_content)
+                        batch_metas.append(meta)
+                        if len(batch_ids) >= BATCH_SIZE:
+                            flush_batch()
+                    total_drawers += len(records)
+                    for r, c in room_counts_delta.items():
+                        room_counts[r] += c
+                    print(
+                        f" ✓ [{completed:4}/{len(pending)}] "
+                        f"{Path(source_file).name[:50]:50} +{len(records)}"
+                    )
+        finally:
+            flush_batch()
+
+        files_skipped = already_mined + skipped_small
 
     print(f"\n{'=' * 55}")
    print(" Done.")
