Commit 9a1b5c3

add synthetic RDS postgres RCA suite (#194)
* add synthetic RDS postgres RCA suite: add bundled synthetic RDS PostgreSQL scenarios and extend RCA evidence handling so the benchmark suite can exercise database-specific failure modes end to end.
* Move alert templates into the CLI module: keep starter alert payload templates scoped to the CLI surface so the top-level app package stays focused on shared runtime logic.
* consolidating versions
* updated towards aws model
* refactoring the rl environment
* adding healthy model
* attempt at hooking up the synthetic test suite to the main agentic pipeline
* synthetic healthy base case creation
* implementing changes
* Potential fix for pull request finding "Statement has no effect"
* codeql improvements

Made-with: Cursor
Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
1 parent 53f017b commit 9a1b5c3

58 files changed

Lines changed: 3144 additions & 36 deletions


Makefile

Lines changed: 6 additions & 1 deletion
@@ -1,7 +1,7 @@
 -include .env
 export
 
-.PHONY: install install-hooks onboard test test-full demo local-rca-demo alert-template investigate-alert verify-integrations check-docker check-langgraph check-langsmith-api-key grafana-local-up grafana-local-down grafana-local-seed local-grafana-live langgraph-build langgraph-deploy clean lint format build deploy deploy-lambda deploy-prefect deploy-flink destroy destroy-lambda destroy-prefect destroy-flink prefect-local-test simulate-k8s-alert test-k8s-local test-k8s test-k8s-datadog deploy-dd-monitors cleanup-dd-monitors deploy-eks destroy-eks test-k8s-eks datadog-demo crashloop-demo regen-trigger-config test-rca test-rca-grafana
+.PHONY: install install-hooks onboard test test-full demo local-rca-demo alert-template investigate-alert verify-integrations check-docker check-langgraph check-langsmith-api-key grafana-local-up grafana-local-down grafana-local-seed local-grafana-live langgraph-build langgraph-deploy clean lint format deploy deploy-lambda deploy-prefect deploy-flink destroy destroy-lambda destroy-prefect destroy-flink prefect-local-test simulate-k8s-alert test-k8s-local test-k8s test-k8s-datadog deploy-dd-monitors cleanup-dd-monitors deploy-eks destroy-eks test-k8s-eks datadog-demo crashloop-demo regen-trigger-config test-rca test-rca-grafana test-rds-synthetic
 
 ifneq ($(wildcard .venv/bin/python),)
 	PYTHON = .venv/bin/python
@@ -96,6 +96,10 @@ prefect-demo:
 test-rca:
 	$(PYTHON) -m tests.rca.run_rca_test $(FILE)
 
+# Run synthetic RDS PostgreSQL RCA benchmark suite
+test-rds-synthetic:
+	$(PYTHON) -m tests.synthetic_testing.rds_postgres.run_suite $(if $(SCENARIO),--scenario $(SCENARIO),)
+
 # Boot local Grafana+Loki, seed deterministic test logs, then run the RCA pipeline
 # Requires GRAFANA_INSTANCE_URL + GRAFANA_READ_TOKEN in .env (see .env.example for local defaults)
 test-rca-grafana: grafana-local-up grafana-local-seed
@@ -324,6 +328,7 @@ help:
 	@echo " make test-grafana - Run Grafana integration tests"
 	@echo " make test-rca - Run all RCA markdown alert tests in tests/rca/"
 	@echo " make test-rca FILE=pipeline_error_in_logs - Run a single RCA alert test"
+	@echo " make test-rds-synthetic - Run the synthetic RDS PostgreSQL RCA suite"
 	@echo " make clean - Clean up cache files"
 	@echo " make lint - Lint code with ruff"
 	@echo " make format - Format code with ruff"
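The `$(if $(SCENARIO),--scenario $(SCENARIO),)` expansion appends `--scenario <name>` only when the `SCENARIO` variable is set, so one target runs either the whole suite or a single scenario. On the Python side this maps naturally onto an optional argparse flag; the sketch below is a hypothetical mirror of that interface (the real `run_suite` CLI is not shown in this diff):

```python
import argparse


def parse_args(argv=None):
    # Hypothetical sketch of the run_suite CLI; the real module may differ.
    parser = argparse.ArgumentParser(
        description="Run the synthetic RDS PostgreSQL RCA suite"
    )
    parser.add_argument(
        "--scenario",
        default=None,
        help="Run a single named scenario; omit to run every bundled scenario",
    )
    return parser.parse_args(argv)


# `make test-rds-synthetic` expands to no extra flag -> full suite
assert parse_args([]).scenario is None
# `make test-rds-synthetic SCENARIO=replica_lag` adds `--scenario replica_lag`
assert parse_args(["--scenario", "replica_lag"]).scenario == "replica_lag"
```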

app/agent/__init__.py

Lines changed: 12 additions & 4 deletions
@@ -1,7 +1,15 @@
 """Agent core - LangGraph investigation and report generation."""
 
-from app.agent.runners import run_investigation
+from __future__ import annotations
 
-__all__ = [
-    "run_investigation",
-]
+from typing import Any
+
+
+def run_investigation(*args: Any, **kwargs: Any):
+    """Lazily import the full runner stack to avoid optional dependency churn at import time."""
+    from app.agent.runners import run_investigation as _run_investigation
+
+    return _run_investigation(*args, **kwargs)
+
+
+__all__ = ["run_investigation"]
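The rewrite above replaces an eager re-export with a thin wrapper that imports `app.agent.runners` only on first call, so `import app.agent` no longer pulls in the runner stack's optional dependencies. The same pattern can be sketched generically with the standard library (names here are illustrative, not from the repo):

```python
import importlib


def make_lazy(module_name: str, attr: str):
    """Return a callable that defers importing `module_name` until first use."""

    def wrapper(*args, **kwargs):
        # The import happens here, at call time, not when the wrapper is defined.
        module = importlib.import_module(module_name)
        return getattr(module, attr)(*args, **kwargs)

    return wrapper


# Defining the wrapper imports nothing; calling it imports json on demand.
dumps = make_lazy("json", "dumps")
assert dumps({"ok": True}) == '{"ok": true}'
```

The trade-off is that import errors surface at first call rather than at package import, which is exactly what a test harness with optional integrations wants.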

app/agent/nodes/plan_actions/detect_sources.py

Lines changed: 13 additions & 4 deletions
@@ -325,21 +325,28 @@ def detect_sources(
         or annotations.get("correlation_id")
     )
 
-    # Only include Grafana when alert came from Grafana, or when source is truly unknown
+    # Only include Grafana when alert came from Grafana, or when source is truly unknown,
+    # or when a pre-injected backend is present (e.g. FixtureGrafanaBackend for synthetic tests).
     grafana_int = None
     grafana_local = False
-    if resolved_integrations and alert_source in ("grafana", ""):
+    if resolved_integrations:
         if resolved_integrations.get("grafana_local"):
             grafana_int = resolved_integrations["grafana_local"]
             grafana_local = True
         elif resolved_integrations.get("grafana"):
             grafana_int = resolved_integrations["grafana"]
 
+    # When a _backend is injected we allow any alert_source; otherwise restrict to Grafana/unknown.
+    _has_injected_backend = bool(grafana_int and "_backend" in grafana_int)
+    if grafana_int and not (_has_injected_backend or alert_source in ("grafana", "")):
+        grafana_int = None  # suppress real Grafana for non-Grafana alerts
+
     if grafana_int:
         endpoint = grafana_int.get("endpoint", "")
         api_key = grafana_int.get("api_key", "")
-        # Local Grafana uses anonymous auth (empty api_key is valid for localhost)
-        if endpoint and (api_key or grafana_local):
+        has_backend = "_backend" in grafana_int
+        # Local Grafana uses anonymous auth; injected backends don't need credentials at all.
+        if has_backend or (endpoint and (api_key or grafana_local)):
             service_name = _map_pipeline_to_service_name(pipeline_name) if pipeline_name else ""
 
             grafana_params: dict[str, Any] = {
@@ -353,6 +360,8 @@ def detect_sources(
             }
             if execution_run_id:
                 grafana_params["execution_run_id"] = execution_run_id
+            if has_backend:
+                grafana_params["_backend"] = grafana_int["_backend"]
             sources["grafana"] = grafana_params
 
     # Only include Datadog when alert came from Datadog, or when source is truly unknown
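The gating above can be condensed into a single predicate: an injected `_backend` bypasses both the alert-source restriction and the credential check, while a real Grafana integration still requires a Grafana/unknown source plus an endpoint and credentials. A simplified, self-contained restatement (an assumption-level sketch, not the repo's actual helper):

```python
def should_use_grafana(grafana_int, alert_source, grafana_local=False):
    """Sketch of the source-detection gate: injected _backend wins everywhere."""
    if not grafana_int:
        return False
    has_backend = "_backend" in grafana_int
    # Without an injected backend, only Grafana-sourced or unknown-source alerts qualify.
    if not (has_backend or alert_source in ("grafana", "")):
        return False
    endpoint = grafana_int.get("endpoint", "")
    api_key = grafana_int.get("api_key", "")
    # Local Grafana uses anonymous auth; injected backends need no credentials at all.
    return has_backend or bool(endpoint and (api_key or grafana_local))


# Fixture backend: allowed even for a Datadog-sourced alert, no credentials needed.
assert should_use_grafana({"_backend": object()}, alert_source="datadog")
# Real Grafana: credentials required...
assert not should_use_grafana({"endpoint": "http://g", "api_key": ""}, alert_source="grafana")
assert should_use_grafana({"endpoint": "http://g", "api_key": "k"}, alert_source="")
# ...and non-Grafana alerts are still suppressed.
assert not should_use_grafana({"endpoint": "http://g", "api_key": "k"}, alert_source="datadog")
```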

app/agent/nodes/plan_actions/extract_keywords.py

Lines changed: 7 additions & 0 deletions
@@ -27,6 +27,13 @@
     "trace",
     "debug",
     "metrics",
+    "rds",
+    "postgres",
+    "database",
+    "replication",
+    "connections",
+    "storage",
+    "failover",
     "cpu",
     "disk",
     "resource",

app/agent/nodes/publish_findings/node.py

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ def generate_report(state: InvestigationState) -> dict:
         logger.warning("[publish] ingest url update failed: %s", exc)
 
     all_blocks = build_slack_blocks(ctx) + build_action_blocks(investigation_url, investigation_id)
-    render_report(slack_message)
+    render_report(slack_message, root_cause_category=state.get("root_cause_category"))
     open_in_editor(slack_message)
 
     slack_ctx = state.get("slack_context", {})

app/agent/nodes/publish_findings/renderers/terminal.py

Lines changed: 16 additions & 9 deletions
@@ -106,7 +106,7 @@ def _render_rich_evidence_item(console: Console, line: str) -> None:
 # Main render entry points
 # ─────────────────────────────────────────────────────────────────────────────
 
-def render_report(slack_message: str) -> None:
+def render_report(slack_message: str, root_cause_category: str | None = None) -> None:
     """Render the final RCA report to terminal."""
     fmt = get_output_format()
 
@@ -118,19 +118,23 @@ def render_report(slack_message: str) -> None:
         return
 
     if fmt == "rich":
-        _render_rich_report(slack_message)
+        _render_rich_report(slack_message, root_cause_category=root_cause_category)
     else:
-        _render_plain_report(slack_message)
+        _render_plain_report(slack_message, root_cause_category=root_cause_category)
 
 
-def _render_rich_report(slack_message: str) -> None:
+def _render_rich_report(slack_message: str, root_cause_category: str | None = None) -> None:
     console = Console()
     console.print()
 
-    # Completion dot at the top
+    # Header varies by outcome
     done = Text()
-    done.append(" ● ", style="bold green")
-    done.append("Investigation complete", style="bold white")
+    if root_cause_category == "healthy":
+        done.append(" ✓ ", style="bold green")
+        done.append("Systems healthy", style="bold green")
+    else:
+        done.append(" ● ", style="bold green")
+        done.append("Investigation complete", style="bold white")
     console.print(done)
     console.print()
 
@@ -196,9 +200,12 @@ def _render_rich_report(slack_message: str) -> None:
     console.print()
 
 
-def _render_plain_report(slack_message: str) -> None:
+def _render_plain_report(slack_message: str, root_cause_category: str | None = None) -> None:
     print()
-    print("Investigation complete")
+    if root_cause_category == "healthy":
+        print("✓ Systems healthy")
+    else:
+        print("Investigation complete")
     print()
     clean = _strip_slack_links(_strip_mrkdwn(slack_message))
     print(clean)

app/agent/nodes/root_cause_diagnosis/claim_validator.py

Lines changed: 45 additions & 0 deletions
@@ -23,6 +23,20 @@ def _has_any_logs(evidence: dict[str, Any]) -> bool:
     )
 
 
+def _has_rds_metrics(evidence: dict[str, Any]) -> bool:
+    metrics = evidence.get("rds_metrics", {})
+    return bool(metrics and (metrics.get("metrics") or metrics.get("observations")))
+
+
+def _has_rds_events(evidence: dict[str, Any]) -> bool:
+    return bool(evidence.get("rds_events"))
+
+
+def _has_performance_insights(evidence: dict[str, Any]) -> bool:
+    insights = evidence.get("performance_insights", {})
+    return bool(insights and (insights.get("top_sql") or insights.get("wait_events") or insights.get("observations")))
+
+
 def _datadog_logs_contain(evidence: dict[str, Any], keywords: tuple[str, ...]) -> bool:
     """Check if any Datadog log message contains at least one of the given keywords."""
     for log in evidence.get("datadog_error_logs", []) + evidence.get("datadog_logs", []):
@@ -43,10 +57,32 @@ def validate_claim(claim: str, evidence: dict[str, Any]) -> bool:
 
     if ("memory" in claim_lower or "cpu" in claim_lower) and not (
         evidence.get("host_metrics", {}).get("data")
+        or _has_rds_metrics(evidence)
+        or _has_performance_insights(evidence)
         or any(kw in claim_lower for kw in ("monitor", "datadog")) and has_dd
     ):
         return False
 
+    if any(
+        kw in claim_lower
+        for kw in (
+            "rds",
+            "postgres",
+            "database",
+            "replica",
+            "replication lag",
+            "connection",
+            "storage",
+            "disk",
+            "failover",
+            "reboot",
+        )
+    ) and not (_has_rds_metrics(evidence) or _has_rds_events(evidence) or _has_performance_insights(evidence)):
+        return False
+
+    if ("query" in claim_lower or "sql" in claim_lower or "wait event" in claim_lower) and not _has_performance_insights(evidence):
+        return False
+
     if ("job" in claim_lower or "batch" in claim_lower) and not (
         evidence.get("failed_jobs") or has_dd
     ):
@@ -104,6 +140,15 @@ def extract_evidence_sources(claim: str, evidence: dict[str, Any]) -> list[str]:
         sources.append("tracer_tools")
     if ("metric" in claim_lower or "memory" in claim_lower or "cpu" in claim_lower) and evidence.get("host_metrics", {}).get("data"):
         sources.append("host_metrics")
+    if any(
+        kw in claim_lower
+        for kw in ("metric", "replica", "replication lag", "connection", "storage", "disk", "database", "rds")
+    ) and _has_rds_metrics(evidence):
+        sources.append("rds_metrics")
+    if any(kw in claim_lower for kw in ("event", "failover", "reboot", "promotion", "availability zone")) and _has_rds_events(evidence):
+        sources.append("rds_events")
+    if any(kw in claim_lower for kw in ("query", "sql", "db load", "wait event", "cpu", "load")) and _has_performance_insights(evidence):
+        sources.append("performance_insights")
     if ("lambda" in claim_lower or "function" in claim_lower) and (
         evidence.get("lambda_logs") or evidence.get("lambda_function")
     ):
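The new validator rule follows a keyword-gate pattern: a claim that mentions a database concept is rejected outright unless at least one RDS evidence bucket is populated, which stops the LLM from asserting unobserved database failures. A simplified, self-contained restatement of that gate (evidence-presence checks collapsed to plain truthiness, unlike the stricter helpers above):

```python
def validate_db_claim(claim: str, evidence: dict) -> bool:
    """Simplified sketch of the RDS evidence gate in validate_claim."""
    claim_lower = claim.lower()
    db_keywords = (
        "rds", "postgres", "database", "replica", "replication lag",
        "connection", "storage", "disk", "failover", "reboot",
    )
    # Any populated RDS bucket counts as supporting evidence here.
    has_rds_evidence = any(
        evidence.get(k) for k in ("rds_metrics", "rds_events", "performance_insights")
    )
    if any(kw in claim_lower for kw in db_keywords) and not has_rds_evidence:
        return False
    return True


# A replication-lag claim with no RDS evidence is rejected...
assert not validate_db_claim("Replica is lagging behind primary", {})
# ...but accepted once rds_metrics carries data.
assert validate_db_claim("Replica is lagging behind primary", {"rds_metrics": {"metrics": [1]}})
# Non-database claims pass through this gate untouched.
assert validate_db_claim("Deploy pipeline succeeded", {})
```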

app/agent/nodes/root_cause_diagnosis/evidence_checker.py

Lines changed: 74 additions & 0 deletions
@@ -2,6 +2,29 @@
 
 from typing import Any
 
+# Alert state strings that indicate no active incident across common monitoring platforms.
+_HEALTHY_STATES = frozenset({"normal", "resolved", "ok"})
+
+# Severity levels that are non-actionable (i.e. scheduled checks, informational only).
+_HEALTHY_SEVERITIES = frozenset({"info", "none", ""})
+
+# Annotation keys whose non-empty presence signals an active error condition.
+_ERROR_ANNOTATION_KEYS = ("error", "error_message", "log_excerpt", "failed_steps")
+
+# Evidence keys whose presence (even with empty values) confirms investigation was attempted.
+# An empty grafana_logs list is itself a healthy signal: no errors found during investigation.
+_INVESTIGATED_EVIDENCE_KEYS = frozenset({
+    "grafana_logs",
+    "grafana_metrics",
+    "grafana_alert_rules",
+    "rds_metrics",
+    "rds_events",
+    "performance_insights",
+    "cloudwatch_logs",
+    "datadog_logs",
+    "datadog_monitors",
+})
+
 
 def check_evidence_availability(
     context: dict[str, Any], evidence: dict[str, Any], raw_alert: dict | str
@@ -34,6 +57,9 @@ def check_evidence_availability(
         or evidence.get("s3_marker")
         or evidence.get("lambda_function")
        or evidence.get("lambda_logs")
+        or evidence.get("rds_metrics")
+        or evidence.get("rds_events")
+        or evidence.get("performance_insights")
     )
 
     # Check for evidence in alert annotations or raw text
@@ -54,6 +80,54 @@ def check_evidence_availability(
     return has_tracer_evidence, has_cloudwatch_evidence, has_alert_evidence
 
 
+def is_clearly_healthy(raw_alert: dict[str, Any] | str, evidence: dict[str, Any]) -> bool:
+    """Return True only when all four conditions confirm no active incident.
+
+    Conditions (all must hold):
+    1. Alert ``state`` is in {"normal", "resolved", "ok"} — covers Grafana, CloudWatch,
+       PagerDuty, and most other monitoring platforms.
+    2. Alert ``severity`` is in {"info", "none", ""} — rules out a resolved-critical that
+       still warrants investigation.
+    3. No error-signal annotation keys (``error``, ``error_message``, ``log_excerpt``,
+       ``failed_steps``) are non-empty.
+    4. At least one evidence key is populated — distinguishes "healthy evidence" from
+       "no evidence gathered yet".
+
+    Blast radius if this misfires (false-healthy): the short-circuit returns
+    root_cause_category="healthy" without an LLM call. A real incident would receive a
+    "healthy" report. This is mitigated by:
+    - The severity gate: firing critical/high/warning alerts never satisfy condition 2.
+    - The HEALTHY_SHORT_CIRCUIT env flag (default "true") — set to "false" to disable
+      without a deploy.
+    """
+    if not isinstance(raw_alert, dict):
+        return False
+
+    # Condition 1: alert state signals no active incident.
+    state = str(raw_alert.get("state", "")).lower().strip()
+    if state not in _HEALTHY_STATES:
+        return False
+
+    # Condition 2: severity is non-actionable.
+    labels = raw_alert.get("commonLabels", raw_alert.get("labels", {})) or {}
+    severity = str(labels.get("severity", raw_alert.get("severity", ""))).lower().strip()
+    if severity not in _HEALTHY_SEVERITIES:
+        return False
+
+    # Condition 3: no error-signal annotations.
+    annotations = (
+        raw_alert.get("commonAnnotations", raw_alert.get("annotations", {})) or {}
+    )
+    if any(annotations.get(key) for key in _ERROR_ANNOTATION_KEYS):
+        return False
+
+    # Condition 4: at least one known investigation key exists in evidence (even if empty).
+    # An empty grafana_logs / grafana_metrics / etc. after a completed investigation is itself
+    # a health signal — it means no errors were found. We only require that the key is present
+    # (investigation was attempted), not that it contains data.
+    return any(k in evidence for k in _INVESTIGATED_EVIDENCE_KEYS)
+
+
 def check_vendor_evidence_missing(evidence: dict[str, Any]) -> bool:
     """
     Check if vendor/external API evidence is missing.
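The four gates of `is_clearly_healthy` compose with AND semantics, so any single failing gate keeps the investigation on the normal path. A condensed, self-contained restatement (same key names as the diff, but with a reduced evidence-key set for brevity):

```python
def is_clearly_healthy(raw_alert, evidence) -> bool:
    """Condensed restatement of the four healthy gates (reduced evidence-key set)."""
    if not isinstance(raw_alert, dict):
        return False
    # Gate 1: state signals no active incident.
    if str(raw_alert.get("state", "")).lower().strip() not in {"normal", "resolved", "ok"}:
        return False
    # Gate 2: severity is non-actionable.
    labels = raw_alert.get("commonLabels", raw_alert.get("labels", {})) or {}
    severity = str(labels.get("severity", raw_alert.get("severity", ""))).lower().strip()
    if severity not in {"info", "none", ""}:
        return False
    # Gate 3: no error-signal annotations.
    annotations = raw_alert.get("commonAnnotations", raw_alert.get("annotations", {})) or {}
    if any(annotations.get(k) for k in ("error", "error_message", "log_excerpt", "failed_steps")):
        return False
    # Gate 4: investigation was at least attempted (key presence, not key contents).
    return any(k in evidence for k in {"grafana_logs", "rds_metrics", "datadog_logs"})


# All four gates pass: resolved state, empty severity, no error annotations,
# and an (empty!) grafana_logs key proving the investigation ran.
assert is_clearly_healthy({"state": "resolved"}, {"grafana_logs": []})
# A resolved-critical still warrants investigation (gate 2 fails).
assert not is_clearly_healthy(
    {"state": "resolved", "labels": {"severity": "critical"}}, {"grafana_logs": []}
)
# No evidence keys at all means "not investigated yet", never "healthy" (gate 4 fails).
assert not is_clearly_healthy({"state": "ok"}, {})
```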
