fix(ios): guard talk TTS callbacks to active utterance

mbelinky · ngutman · mbelinky · commit dd88886e416e · 2026-03-03T23:29:17.000+01:00
Co-authored-by: Nimrod Gutman <nimrod.g@singular.net>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,7 @@ Docs: https://docs.openclaw.ai
 ### Fixes
 
 - Docs/security hardening guidance: document Docker `DOCKER-USER` + UFW policy and add cross-linking from Docker install docs for VPS/public-host setups. (#27613) thanks @dorukardahan.
+- iOS/Voice timing safety: guard system speech start/finish callbacks to the active utterance to avoid misattributed start events during rapid stop/restart cycles. (#33304) thanks @mbelinky; original implementation direction by @ngutman.
 - Docs/tool-loop detection config keys: align `docs/tools/loop-detection.md` examples and field names with the current `tools.loopDetection` schema to prevent copy-paste validation failures from outdated keys. (#33182) Thanks @Mylszd.
 - Gateway/session agent discovery: include disk-scanned agent IDs in `listConfiguredAgentIds` even when `agents.list` is configured, so disk-only/ACP agent sessions remain visible in gateway session aggregation and listings. (#32831) thanks @Sid-Qin.
 - Discord/inbound debouncer: skip bot-own MESSAGE_CREATE events before they reach the debounce queue to avoid self-triggered slowdowns in busy servers. Thanks @thewilloftheshadow.
diff --git a/apps/shared/OpenClawKit/Sources/OpenClawKit/TalkSystemSpeechSynthesizer.swift b/apps/shared/OpenClawKit/Sources/OpenClawKit/TalkSystemSpeechSynthesizer.swift
@@ -12,6 +12,7 @@ public final class TalkSystemSpeechSynthesizer: NSObject {
     private let synth = AVSpeechSynthesizer()
     private var speakContinuation: CheckedContinuation<Void, Error>?
     private var currentUtterance: AVSpeechUtterance?
+    private var didStartCallback: (() -> Void)?
     private var currentToken = UUID()
     private var watchdog: Task<Void, Never>?
 
@@ -26,17 +27,23 @@ public final class TalkSystemSpeechSynthesizer: NSObject {
         self.currentToken = UUID()
         self.watchdog?.cancel()
         self.watchdog = nil
+        self.didStartCallback = nil
         self.synth.stopSpeaking(at: .immediate)
         self.finishCurrent(with: SpeakError.canceled)
     }
 
-    public func speak(text: String, language: String? = nil) async throws {
+    public func speak(
+        text: String,
+        language: String? = nil,
+        onStart: (() -> Void)? = nil
+    ) async throws {
         let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
         guard !trimmed.isEmpty else { return }
 
         self.stop()
         let token = UUID()
         self.currentToken = token
+        self.didStartCallback = onStart
 
         let utterance = AVSpeechUtterance(string: trimmed)
         if let language, let voice = AVSpeechSynthesisVoice(language: language) {
@@ -76,15 +83,21 @@ public final class TalkSystemSpeechSynthesizer: NSObject {
         }
     }
 
-    private func handleFinish(error: Error?) {
-        guard self.currentUtterance != nil else { return }
+    private func matchesCurrentUtterance(_ utteranceID: ObjectIdentifier) -> Bool {
+        guard let currentUtterance = self.currentUtterance else { return false }
+        return ObjectIdentifier(currentUtterance) == utteranceID
+    }
+
+    private func handleFinish(utteranceID: ObjectIdentifier, error: Error?) {
+        guard self.matchesCurrentUtterance(utteranceID) else { return }
         self.watchdog?.cancel()
         self.watchdog = nil
         self.finishCurrent(with: error)
     }
 
     private func finishCurrent(with error: Error?) {
         self.currentUtterance = nil
+        self.didStartCallback = nil
         let cont = self.speakContinuation
         self.speakContinuation = nil
         if let error {
@@ -96,21 +109,36 @@ public final class TalkSystemSpeechSynthesizer: NSObject {
 }
 
 extension TalkSystemSpeechSynthesizer: AVSpeechSynthesizerDelegate {
+    public nonisolated func speechSynthesizer(
+        _ synthesizer: AVSpeechSynthesizer,
+        didStart utterance: AVSpeechUtterance)
+    {
+        let utteranceID = ObjectIdentifier(utterance)
+        Task { @MainActor in
+            guard self.matchesCurrentUtterance(utteranceID) else { return }
+            let callback = self.didStartCallback
+            self.didStartCallback = nil
+            callback?()
+        }
+    }
+
     public nonisolated func speechSynthesizer(
         _ synthesizer: AVSpeechSynthesizer,
         didFinish utterance: AVSpeechUtterance)
     {
+        let utteranceID = ObjectIdentifier(utterance)
         Task { @MainActor in
-            self.handleFinish(error: nil)
+            self.handleFinish(utteranceID: utteranceID, error: nil)
         }
     }
 
     public nonisolated func speechSynthesizer(
         _ synthesizer: AVSpeechSynthesizer,
         didCancel utterance: AVSpeechUtterance)
     {
+        let utteranceID = ObjectIdentifier(utterance)
         Task { @MainActor in
-            self.handleFinish(error: SpeakError.canceled)
+            self.handleFinish(utteranceID: utteranceID, error: SpeakError.canceled)
         }
     }
 }