fix(metrics): propagate filter_stats in self-reflection remaining-tools loops

bug-ops · bug-ops · commit fe715d9863d1 · 2026-03-17T14:49:01.000+01:00
In the native tool execution path, two "remaining tools" loops (the self-reflection success path and the error path) discarded filter_stats from tool outputs: they sent ToolOutputEvent with filter_stats: None and skipped the metric updates. As a result, filter_raw_tokens, filter_saved_tokens, and filter_applications remained zero in the TUI even when tools produced filtered output. Both loops now extract filter_stats from Ok(Some(out)) results, update the filter_* metrics (same pattern as the normal processing loop), and pass the inline stats to ToolOutputEvent when the output was actually reduced (filtered_chars < raw_chars). Adds a regression test that verifies filter_* metrics increment when the normal (single tool call) native tool path processes a tool returning FilterStats. Closes #1939
diff --git a/crates/zeph-core/src/agent/tests.rs b/crates/zeph-core/src/agent/tests.rs
@@ -4226,6 +4226,86 @@ mod shutdown_summary_tests {
         );
     }
 
+    // Test for filter_stats metric propagation (issue #1939).
+    // Covers the normal native tool path (single tool call): filter_* metrics must increment
+    // when the tool returns FilterStats.
+
+    #[tokio::test]
+    async fn filter_stats_metrics_increment_on_normal_native_tool_path() {
+        use crate::metrics::MetricsSnapshot;
+        use tokio::sync::watch;
+        use zeph_llm::mock::MockProvider;
+        use zeph_llm::provider::{ChatResponse, ToolUseRequest};
+        use zeph_tools::executor::{FilterStats, ToolCall, ToolError, ToolExecutor, ToolOutput};
+
+        struct FilteredToolExecutor;
+
+        impl ToolExecutor for FilteredToolExecutor {
+            async fn execute(&self, _response: &str) -> Result<Option<ToolOutput>, ToolError> {
+                Ok(None)
+            }
+
+            async fn execute_tool_call(
+                &self,
+                _call: &ToolCall,
+            ) -> Result<Option<ToolOutput>, ToolError> {
+                Ok(Some(ToolOutput {
+                    tool_name: "shell".to_owned(),
+                    summary: "filtered output".to_owned(),
+                    blocks_executed: 1,
+                    filter_stats: Some(FilterStats {
+                        raw_chars: 400,
+                        filtered_chars: 200,
+                        raw_lines: 20,
+                        filtered_lines: 10,
+                        confidence: None,
+                        command: None,
+                        kept_lines: vec![],
+                    }),
+                    diff: None,
+                    streamed: false,
+                    terminal_id: None,
+                    locations: None,
+                    raw_response: None,
+                }))
+            }
+        }
+
+        let (mock, _counter) = MockProvider::default().with_tool_use(vec![
+            ChatResponse::ToolUse {
+                text: None,
+                tool_calls: vec![ToolUseRequest {
+                    id: "call-1".to_owned(),
+                    name: "shell".to_owned(),
+                    input: serde_json::json!({"cmd": "ls"}),
+                }],
+                thinking_blocks: vec![],
+            },
+            ChatResponse::Text("done".to_owned()),
+        ]);
+        let provider = AnyProvider::Mock(mock);
+        let channel = MockChannel::new(vec!["run a tool".to_owned()]);
+        let registry = create_test_registry();
+        let executor = FilteredToolExecutor;
+        let (tx, rx) = watch::channel(MetricsSnapshot::default());
+
+        let mut agent = Agent::new(provider, channel, registry, None, 5, executor).with_metrics(tx);
+        agent.run().await.expect("agent run must succeed");
+
+        let snap: MetricsSnapshot = rx.borrow().clone();
+        assert!(
+            snap.filter_applications > 0,
+            "filter_applications must be > 0"
+        );
+        assert!(snap.filter_raw_tokens > 0, "filter_raw_tokens must be > 0");
+        assert!(
+            snap.filter_saved_tokens > 0,
+            "filter_saved_tokens must be > 0"
+        );
+        assert_eq!(snap.filter_total_commands, 1);
+        assert_eq!(snap.filter_filtered_commands, 1);
+    }
+
     // Regression test for issue #1910: corrections must be stored in user_corrections even when
     // LearningConfig::enabled = false (skill auto-improvement is disabled).
     #[tokio::test]
diff --git a/crates/zeph-core/src/agent/tool_execution/native.rs b/crates/zeph-core/src/agent/tool_execution/native.rs
@@ -1271,24 +1271,59 @@ impl<C: Channel> Agent<C> {
                                 let remaining_tc = &tool_calls[remaining_idx];
                                 let remaining_result =
                                     std::mem::replace(&mut tool_results[remaining_idx], Ok(None));
-                                let (remaining_content, remaining_is_error) = match remaining_result
-                                {
-                                    Ok(Some(ref out)) => {
-                                        let (sanitized, _) = self
-                                            .sanitize_tool_output(&out.summary, &remaining_tc.name)
-                                            .await;
-                                        (sanitized, false)
-                                    }
-                                    Ok(None) => ("(no output)".to_owned(), false),
-                                    Err(ref e) => (format!("[error] {e}"), true),
-                                };
+                                let (remaining_content, remaining_is_error, remaining_inline_stats) =
+                                    match remaining_result {
+                                        Ok(Some(ref out)) => {
+                                            let (sanitized, _) = self
+                                                .sanitize_tool_output(
+                                                    &out.summary,
+                                                    &remaining_tc.name,
+                                                )
+                                                .await;
+                                            if let Some(ref fs) = out.filter_stats {
+                                                let saved = fs.estimated_tokens_saved() as u64;
+                                                let raw = (fs.raw_chars / 4) as u64;
+                                                let confidence = fs.confidence;
+                                                let was_filtered = fs.filtered_chars < fs.raw_chars;
+                                                self.update_metrics(|m| {
+                                                    m.filter_raw_tokens += raw;
+                                                    m.filter_saved_tokens += saved;
+                                                    m.filter_applications += 1;
+                                                    m.filter_total_commands += 1;
+                                                    if was_filtered {
+                                                        m.filter_filtered_commands += 1;
+                                                    }
+                                                    if let Some(c) = confidence {
+                                                        match c {
+                                                            zeph_tools::FilterConfidence::Full => {
+                                                                m.filter_confidence_full += 1;
+                                                            }
+                                                            zeph_tools::FilterConfidence::Partial => {
+                                                                m.filter_confidence_partial += 1;
+                                                            }
+                                                            zeph_tools::FilterConfidence::Fallback => {
+                                                                m.filter_confidence_fallback += 1;
+                                                            }
+                                                        }
+                                                    }
+                                                });
+                                            }
+                                            let inline = out.filter_stats.as_ref().and_then(|fs| {
+                                                (fs.filtered_chars < fs.raw_chars)
+                                                    .then(|| fs.format_inline(&remaining_tc.name))
+                                            });
+                                            (sanitized, false, inline)
+                                        }
+                                        Ok(None) => ("(no output)".to_owned(), false, None),
+                                        Err(ref e) => (format!("[error] {e}"), true, None),
+                                    };
                                 let body_display = self.maybe_redact(&remaining_content);
                                 self.channel
                                     .send_tool_output(ToolOutputEvent {
                                         tool_name: &remaining_tc.name,
                                         body: &body_display,
                                         diff: None,
-                                        filter_stats: None,
+                                        filter_stats: remaining_inline_stats,
                                         kept_lines: None,
                                         locations: None,
                                         tool_call_id: &tool_call_ids[remaining_idx],
@@ -1332,24 +1367,59 @@ impl<C: Channel> Agent<C> {
                                 let remaining_tc = &tool_calls[remaining_idx];
                                 let remaining_result =
                                     std::mem::replace(&mut tool_results[remaining_idx], Ok(None));
-                                let (remaining_content, remaining_is_error) = match remaining_result
-                                {
-                                    Ok(Some(ref out)) => {
-                                        let (sanitized, _) = self
-                                            .sanitize_tool_output(&out.summary, &remaining_tc.name)
-                                            .await;
-                                        (sanitized, false)
-                                    }
-                                    Ok(None) => ("(no output)".to_owned(), false),
-                                    Err(ref re) => (format!("[error] {re}"), true),
-                                };
+                                let (remaining_content, remaining_is_error, remaining_inline_stats) =
+                                    match remaining_result {
+                                        Ok(Some(ref out)) => {
+                                            let (sanitized, _) = self
+                                                .sanitize_tool_output(
+                                                    &out.summary,
+                                                    &remaining_tc.name,
+                                                )
+                                                .await;
+                                            if let Some(ref fs) = out.filter_stats {
+                                                let saved = fs.estimated_tokens_saved() as u64;
+                                                let raw = (fs.raw_chars / 4) as u64;
+                                                let confidence = fs.confidence;
+                                                let was_filtered = fs.filtered_chars < fs.raw_chars;
+                                                self.update_metrics(|m| {
+                                                    m.filter_raw_tokens += raw;
+                                                    m.filter_saved_tokens += saved;
+                                                    m.filter_applications += 1;
+                                                    m.filter_total_commands += 1;
+                                                    if was_filtered {
+                                                        m.filter_filtered_commands += 1;
+                                                    }
+                                                    if let Some(c) = confidence {
+                                                        match c {
+                                                            zeph_tools::FilterConfidence::Full => {
+                                                                m.filter_confidence_full += 1;
+                                                            }
+                                                            zeph_tools::FilterConfidence::Partial => {
+                                                                m.filter_confidence_partial += 1;
+                                                            }
+                                                            zeph_tools::FilterConfidence::Fallback => {
+                                                                m.filter_confidence_fallback += 1;
+                                                            }
+                                                        }
+                                                    }
+                                                });
+                                            }
+                                            let inline = out.filter_stats.as_ref().and_then(|fs| {
+                                                (fs.filtered_chars < fs.raw_chars)
+                                                    .then(|| fs.format_inline(&remaining_tc.name))
+                                            });
+                                            (sanitized, false, inline)
+                                        }
+                                        Ok(None) => ("(no output)".to_owned(), false, None),
+                                        Err(ref re) => (format!("[error] {re}"), true, None),
+                                    };
                                 let body_display = self.maybe_redact(&remaining_content);
                                 self.channel
                                     .send_tool_output(ToolOutputEvent {
                                         tool_name: &remaining_tc.name,
                                         body: &body_display,
                                         diff: None,
-                                        filter_stats: None,
+                                        filter_stats: remaining_inline_stats,
                                         kept_lines: None,
                                         locations: None,
                                         tool_call_id: &tool_call_ids[remaining_idx],