|
39 | 39 |
|
40 | 40 | # ── Config (env vars override CLI defaults) ─────────────────────────────────── |
41 | 41 |
|
42 | | -VERSION = "1.5.0" |
| 42 | +VERSION = "1.5.1" |
43 | 43 | DEFAULT_HOST = os.getenv("PALACE_HOST", "0.0.0.0") |
44 | 44 | DEFAULT_PORT = int(os.getenv("PALACE_PORT", "8085")) |
45 | 45 | DEFAULT_PALACE = os.getenv("PALACE_PATH", "") |
|
67 | 67 | _log = logging.getLogger("palace-daemon") |
68 | 68 |
|
69 | 69 |
|
| 70 | +# ── Systemd watchdog / sd_notify ───────────────────────────────────────────── |
| 71 | + |
| 72 | +def _sd_notify(msg: str) -> None: |
| 73 | + """Send a message to systemd notify socket without external dependencies.""" |
| 74 | + sock_path = os.environ.get("NOTIFY_SOCKET", "") |
| 75 | + if not sock_path: |
| 76 | + return |
| 77 | + try: |
| 78 | + import socket as _sock |
| 79 | + with _sock.socket(_sock.AF_UNIX, _sock.SOCK_DGRAM) as s: |
| 80 | + # Abstract namespace sockets use NUL prefix; systemd uses @ prefix. |
| 81 | + addr = chr(0) + sock_path[1:] if sock_path.startswith("@") else sock_path |
| 82 | + s.sendto(msg.encode(), addr) |
| 83 | + except Exception: |
| 84 | + pass |
| 85 | + |
| 86 | + |
| 87 | +def _watchdog_interval() -> int: |
| 88 | + """Return WatchdogSec in seconds from WATCHDOG_USEC (set by systemd), or 0.""" |
| 89 | + try: |
| 90 | + return int(os.environ.get("WATCHDOG_USEC", "0")) // 1_000_000 |
| 91 | + except ValueError: |
| 92 | + return 0 |
| 93 | + |
| 94 | + |
| 95 | +async def _watchdog_loop(interval_secs: int) -> None: |
| 96 | + """Ping systemd watchdog at half the watchdog interval, only when palace is healthy.""" |
| 97 | + tick = max(10, interval_secs // 2) |
| 98 | + while True: |
| 99 | + await asyncio.sleep(tick) |
| 100 | + try: |
| 101 | + loop = asyncio.get_running_loop() |
| 102 | + col = await loop.run_in_executor(None, _mp._get_collection) |
| 103 | + if col is not None: |
| 104 | + _sd_notify("WATCHDOG=1\n") |
| 105 | + else: |
| 106 | + _log.warning("Watchdog: palace collection unavailable — skipping WATCHDOG=1") |
| 107 | + except Exception as e: |
| 108 | + _log.warning("Watchdog check failed: %s", e) |
| 109 | + |
| 110 | + |
70 | 111 | async def _warn_if_hnsw_threads_unset() -> None: |
71 | 112 | """Warn if hnsw:num_threads != 1 after a collection reopen. |
72 | 113 |
|
@@ -321,16 +362,24 @@ async def lifespan(app: FastAPI): |
321 | 362 | # Warm the ChromaDB client before accepting traffic. The Rust HNSW binding |
322 | 363 | # occasionally segfaults on the very first request if opened cold; opening |
323 | 364 | # it here (before yield) ensures the PersistentClient is fully initialized. |
| 365 | + # We open the collection directly (not via ping) so that _get_collection's |
| 366 | + # hnsw:num_threads=1 fix is applied before _warn_if_hnsw_threads_unset runs. |
324 | 367 | try: |
325 | 368 | loop = asyncio.get_running_loop() |
326 | | - await loop.run_in_executor(None, _mp.handle_request, { |
327 | | - "jsonrpc": "2.0", "id": "warmup", "method": "ping", "params": {} |
328 | | - }) |
| 369 | + await loop.run_in_executor(None, _mp._get_collection, True) |
329 | 370 | logger.info("Palace client warmed up.") |
330 | 371 | except Exception as e: |
331 | | - logger.warning("Warmup ping failed (non-fatal): %s", e) |
| 372 | + logger.warning("Warmup collection open failed (non-fatal): %s", e) |
332 | 373 | await _warn_if_hnsw_threads_unset() |
333 | 374 |
|
| 375 | + # Signal systemd that startup is complete (Type=notify in service file). |
| 376 | + _sd_notify("READY=1\n") |
| 377 | + |
| 378 | + # Start systemd watchdog loop if WatchdogSec is configured. |
| 379 | + wdog_secs = _watchdog_interval() |
| 380 | + if wdog_secs > 0: |
| 381 | + asyncio.create_task(_watchdog_loop(wdog_secs)) |
| 382 | + logger.info("Systemd watchdog active (interval=%ds, tick=%ds).", wdog_secs, max(10, wdog_secs // 2)) |
334 | 383 |
|
335 | 384 | yield |
336 | 385 |
|
@@ -371,7 +420,18 @@ async def health(): |
371 | 420 | # Bypass semaphores — health must respond even when all slots are busy. |
372 | 421 | loop = asyncio.get_running_loop() |
373 | 422 | result = await loop.run_in_executor(None, _mp.handle_request, {"jsonrpc": "2.0", "id": 1, "method": "ping", "params": {}}) or {} |
374 | | - return {"status": "ok", "daemon": "palace-daemon", "version": VERSION, "palace": result} |
| 423 | + # Test actual collection access so /health reflects true palace state. |
| 424 | + palace_ok = False |
| 425 | + try: |
| 426 | + col = await loop.run_in_executor(None, _mp._get_collection) |
| 427 | + palace_ok = col is not None |
| 428 | + except Exception: |
| 429 | + pass |
| 430 | + status = "ok" if palace_ok else "degraded" |
| 431 | + payload = {"status": status, "daemon": "palace-daemon", "version": VERSION, "palace": result} |
| 432 | + if not palace_ok: |
| 433 | + return JSONResponse(content=payload, status_code=503) |
| 434 | + return payload |
375 | 435 |
|
376 | 436 |
|
377 | 437 | @app.get("/search") |
|
0 commit comments