Skip to content

Commit bf70610

Browse files
authored
iOS Security Stack 4/5: TTS PCM->MP3 Fallback (#30885) (#33032)
Merged via /review-pr -> /prepare-pr -> /merge-pr. Prepared head SHA: f77e3d7 Co-authored-by: mbelinky <[email protected]> Co-authored-by: mbelinky <[email protected]> Reviewed-by: @mbelinky
1 parent d493861 commit bf70610

File tree

3 files changed

+106
-11
lines changed

3 files changed

+106
-11
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Docs: https://docs.openclaw.ai
2626
- iOS/Gateway keychain hardening: move gateway metadata and TLS fingerprints to device keychain storage with safer migration behavior and rollback-safe writes to reduce credential loss risk during upgrades. (#33029) thanks @mbelinky.
2727
- iOS/Concurrency stability: replace risky shared-state access in camera and gateway connection paths with lock-protected access patterns to reduce crash risk under load. (#33241) thanks @mbelinky.
2828
- iOS/Security guardrails: limit production API-key sourcing to app config and make deep-link confirmation prompts safer by coalescing queued requests instead of silently dropping them. (#33031) thanks @mbelinky.
29+
- iOS/TTS playback fallback: keep voice playback resilient by switching from PCM to MP3 when provider format support is unavailable, while avoiding sticky fallback on generic local playback errors. (#33032) thanks @mbelinky.
2930
- Telegram/multi-account default routing clarity: warn only for ambiguous (2+) account setups without an explicit default, add `openclaw doctor` warnings for missing/invalid multi-account defaults across channels, and document explicit-default guidance for channel routing and Telegram config. (#32544) thanks @Sid-Qin.
3031
- Telegram/plugin outbound hook parity: run `message_sending` + `message_sent` in Telegram reply delivery, include reply-path hook metadata (`mediaUrls`, `threadId`), and report `message_sent.success=false` when hooks blank text and no outbound message is delivered. (#32649) Thanks @KimGLee.
3132
- Agents/Skills runtime loading: propagate run config into embedded attempt and compaction skill-entry loading so explicitly enabled bundled companion skills are discovered consistently when skill snapshots do not already provide resolved entries. Thanks @gumadeiras.

apps/ios/Sources/Voice/TalkModeManager.swift

Lines changed: 86 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,23 @@ import Observation
77
import OSLog
88
import Speech
99

10+
/// Lock-protected holder for a single optional `Error`.
///
/// Declared `@unchecked Sendable`: the compiler does not verify thread safety here,
/// so every read and write of `valueInternal` below is wrapped in `lock`.
private final class StreamFailureBox: @unchecked Sendable {
11+
private let lock = NSLock()
12+
private var valueInternal: Error?
13+
14+
/// Stores `error`, replacing any previously stored value.
func set(_ error: Error) {
15+
self.lock.lock()
16+
self.valueInternal = error
17+
self.lock.unlock()
18+
}
19+
20+
/// The most recently stored error, or `nil` if `set(_:)` has not been called.
var value: Error? {
21+
self.lock.lock()
22+
// `defer` releases the lock after the return value has been read.
defer { self.lock.unlock() }
23+
return self.valueInternal
24+
}
25+
}
26+
1027
// This file intentionally centralizes talk mode state + behavior.
1128
// It's large, and splitting would force `private` -> `fileprivate` across many members.
1229
// We'll refactor into smaller files when the surface stabilizes.
@@ -72,6 +89,9 @@ final class TalkModeManager: NSObject {
7289
private var mainSessionKey: String = "main"
7390
private var fallbackVoiceId: String?
7491
private var lastPlaybackWasPCM: Bool = false
92+
/// Set when the ElevenLabs API rejects PCM format (e.g. 403 subscription_required).
93+
/// Once set, requests that do not specify an output format default to MP3 instead of retrying PCM, until `reloadConfig()` clears the flag.
94+
private var pcmFormatUnavailable: Bool = false
7595
var pcmPlayer: PCMStreamingAudioPlaying = PCMStreamingAudioPlayer.shared
7696
var mp3Player: StreamingAudioPlaying = StreamingAudioPlayer.shared
7797

@@ -1007,7 +1027,8 @@ final class TalkModeManager: NSObject {
10071027
let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)?
10081028
.trimmingCharacters(in: .whitespacesAndNewlines)
10091029
let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil
1010-
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100")
1030+
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(
1031+
requestedOutputFormat ?? self.effectiveDefaultOutputFormat)
10111032
if outputFormat == nil, let requestedOutputFormat {
10121033
self.logger.warning(
10131034
"talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)")
@@ -1036,7 +1057,7 @@ final class TalkModeManager: NSObject {
10361057
let request = makeRequest(outputFormat: outputFormat)
10371058

10381059
let client = ElevenLabsTTSClient(apiKey: apiKey)
1039-
let stream = client.streamSynthesize(voiceId: voiceId, request: request)
1060+
let rawStream = client.streamSynthesize(voiceId: voiceId, request: request)
10401061

10411062
if self.interruptOnSpeech {
10421063
do {
@@ -1051,11 +1072,16 @@ final class TalkModeManager: NSObject {
10511072
let sampleRate = TalkTTSValidation.pcmSampleRate(from: outputFormat)
10521073
let result: StreamingPlaybackResult
10531074
if let sampleRate {
1075+
let streamFailure = StreamFailureBox()
1076+
let stream = Self.monitorStreamFailures(rawStream, failureBox: streamFailure)
10541077
self.lastPlaybackWasPCM = true
10551078
var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate)
10561079
if !playback.finished, playback.interruptedAt == nil {
1057-
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
1080+
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
10581081
self.logger.warning("pcm playback failed; retrying mp3")
1082+
if Self.isPCMFormatRejectedByAPI(streamFailure.value) {
1083+
self.pcmFormatUnavailable = true
1084+
}
10591085
self.lastPlaybackWasPCM = false
10601086
let mp3Stream = client.streamSynthesize(
10611087
voiceId: voiceId,
@@ -1065,7 +1091,7 @@ final class TalkModeManager: NSObject {
10651091
result = playback
10661092
} else {
10671093
self.lastPlaybackWasPCM = false
1068-
result = await self.mp3Player.play(stream: stream)
1094+
result = await self.mp3Player.play(stream: rawStream)
10691095
}
10701096
let duration = Date().timeIntervalSince(started)
10711097
self.logger.info("elevenlabs stream finished=\(result.finished, privacy: .public) dur=\(duration, privacy: .public)s")
@@ -1391,7 +1417,7 @@ final class TalkModeManager: NSObject {
13911417

13921418
private func resolveIncrementalPrefetchOutputFormat(context: IncrementalSpeechContext) -> String? {
13931419
if TalkTTSValidation.pcmSampleRate(from: context.outputFormat) != nil {
1394-
return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
1420+
return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
13951421
}
13961422
return context.outputFormat
13971423
}
@@ -1480,7 +1506,8 @@ final class TalkModeManager: NSObject {
14801506
let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)?
14811507
.trimmingCharacters(in: .whitespacesAndNewlines)
14821508
let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil
1483-
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100")
1509+
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(
1510+
requestedOutputFormat ?? self.effectiveDefaultOutputFormat)
14841511
if outputFormat == nil, let requestedOutputFormat {
14851512
self.logger.warning(
14861513
"talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)")
@@ -1534,6 +1561,44 @@ final class TalkModeManager: NSObject {
15341561
latencyTier: TalkTTSValidation.validatedLatencyTier(context.directive?.latencyTier))
15351562
}
15361563

1564+
/// Returns `mp3_44100_128` when the API has already rejected PCM, otherwise `pcm_44100`.
/// Consulted only as the fallback when neither the directive nor the configured default
/// supplies a non-empty output format; "rejected" here means `pcmFormatUnavailable` is set.
1565+
private var effectiveDefaultOutputFormat: String {
1566+
self.pcmFormatUnavailable ? "mp3_44100_128" : "pcm_44100"
1567+
}
1568+
1569+
/// Wraps `stream` in a new `AsyncThrowingStream` that forwards every chunk unchanged and
/// records any error thrown by `stream` in `failureBox` before passing it on to the consumer.
///
/// - Parameters:
///   - stream: The upstream audio chunk stream to forward.
///   - failureBox: Receives the error if iterating `stream` throws; left untouched on success.
/// - Returns: A stream that yields the same chunks and finishes (or fails) when `stream` does.
private static func monitorStreamFailures(
1570+
_ stream: AsyncThrowingStream<Data, Error>,
1571+
failureBox: StreamFailureBox
1572+
) -> AsyncThrowingStream<Data, Error>
1573+
{
1574+
AsyncThrowingStream { continuation in
1575+
// Forwarding runs in its own task so it can be cancelled from `onTermination` below.
let task = Task {
1576+
do {
1577+
for try await chunk in stream {
1578+
continuation.yield(chunk)
1579+
}
1580+
continuation.finish()
1581+
} catch {
1582+
// Record the error first so the caller can inspect it via `failureBox.value`.
failureBox.set(error)
1583+
continuation.finish(throwing: error)
1584+
}
1585+
}
1586+
// When the returned stream terminates, stop pulling from the upstream stream.
continuation.onTermination = { _ in
1587+
task.cancel()
1588+
}
1589+
}
1590+
}
1591+
1592+
/// Heuristically decides whether `error` looks like the provider refusing the PCM output format.
///
/// Returns `true` only when the error is in the `"ElevenLabsTTS"` domain, its code is at least 400,
/// and its lowercased description contains one of the marker substrings checked below.
///
/// - Parameter error: The error captured from the TTS stream, or `nil` if none was recorded.
/// - Returns: `false` for `nil`, for errors from any other domain, and for codes below 400.
private static func isPCMFormatRejectedByAPI(_ error: Error?) -> Bool {
1593+
guard let error = error as NSError? else { return false }
1594+
// NOTE(review): `code` is presumably the HTTP status; `>= 400` would then also admit 5xx responses — confirm intent.
guard error.domain == "ElevenLabsTTS", error.code >= 400 else { return false }
1595+
// Prefer the explicit userInfo description; fall back to `localizedDescription`. Matching is case-insensitive.
let message = (error.userInfo[NSLocalizedDescriptionKey] as? String ?? error.localizedDescription).lowercased()
1596+
return message.contains("output_format")
1597+
|| message.contains("pcm_")
1598+
|| message.contains("pcm ")
1599+
|| message.contains("subscription_required")
1600+
}
1601+
15371602
private static func makeBufferedAudioStream(chunks: [Data]) -> AsyncThrowingStream<Data, Error> {
15381603
AsyncThrowingStream { continuation in
15391604
for chunk in chunks {
@@ -1575,22 +1640,27 @@ final class TalkModeManager: NSObject {
15751640
text: text,
15761641
context: context,
15771642
outputFormat: context.outputFormat)
1578-
let stream: AsyncThrowingStream<Data, Error>
1643+
let rawStream: AsyncThrowingStream<Data, Error>
15791644
if let prefetchedAudio, !prefetchedAudio.chunks.isEmpty {
1580-
stream = Self.makeBufferedAudioStream(chunks: prefetchedAudio.chunks)
1645+
rawStream = Self.makeBufferedAudioStream(chunks: prefetchedAudio.chunks)
15811646
} else {
1582-
stream = client.streamSynthesize(voiceId: voiceId, request: request)
1647+
rawStream = client.streamSynthesize(voiceId: voiceId, request: request)
15831648
}
15841649
let playbackFormat = prefetchedAudio?.outputFormat ?? context.outputFormat
15851650
let sampleRate = TalkTTSValidation.pcmSampleRate(from: playbackFormat)
15861651
let result: StreamingPlaybackResult
15871652
if let sampleRate {
1653+
let streamFailure = StreamFailureBox()
1654+
let stream = Self.monitorStreamFailures(rawStream, failureBox: streamFailure)
15881655
self.lastPlaybackWasPCM = true
15891656
var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate)
15901657
if !playback.finished, playback.interruptedAt == nil {
15911658
self.logger.warning("pcm playback failed; retrying mp3")
1659+
if Self.isPCMFormatRejectedByAPI(streamFailure.value) {
1660+
self.pcmFormatUnavailable = true
1661+
}
15921662
self.lastPlaybackWasPCM = false
1593-
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
1663+
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
15941664
let mp3Stream = client.streamSynthesize(
15951665
voiceId: voiceId,
15961666
request: self.makeIncrementalTTSRequest(
@@ -1602,7 +1672,7 @@ final class TalkModeManager: NSObject {
16021672
result = playback
16031673
} else {
16041674
self.lastPlaybackWasPCM = false
1605-
result = await self.mp3Player.play(stream: stream)
1675+
result = await self.mp3Player.play(stream: rawStream)
16061676
}
16071677
if !result.finished, let interruptedAt = result.interruptedAt {
16081678
self.lastInterruptedAtSeconds = interruptedAt
@@ -1926,6 +1996,7 @@ extension TalkModeManager {
19261996

19271997
func reloadConfig() async {
19281998
guard let gateway else { return }
1999+
self.pcmFormatUnavailable = false
19292000
do {
19302001
let res = try await gateway.request(
19312002
method: "talk.config",
@@ -2105,6 +2176,10 @@ private final class AudioTapDiagnostics: @unchecked Sendable {
21052176

21062177
#if DEBUG
21072178
extension TalkModeManager {
2179+
/// DEBUG-only test hook that forwards to the private `isPCMFormatRejectedByAPI(_:)`
/// so unit tests can exercise the classification directly.
static func _test_isPCMFormatRejectedByAPI(_ error: Error?) -> Bool {
2180+
self.isPCMFormatRejectedByAPI(error)
2181+
}
2182+
21082183
func _test_seedTranscript(_ transcript: String) {
21092184
self.lastTranscript = transcript
21102185
self.lastHeard = Date()

apps/ios/Tests/TalkModeConfigParsingTests.swift

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import Foundation
12
import Testing
23
@testable import OpenClaw
34

@@ -28,4 +29,22 @@ import Testing
2829
let selection = TalkModeManager.selectTalkProviderConfig(talk)
2930
#expect(selection == nil)
3031
}
32+
33+
/// An `ElevenLabsTTS`-domain error with code 403 whose description mentions
/// `subscription_required` and `output_format` must be classified as a PCM format rejection.
@Test func detectsPCMFormatRejectionFromElevenLabsError() {
34+
let error = NSError(
35+
domain: "ElevenLabsTTS",
36+
code: 403,
37+
userInfo: [
38+
NSLocalizedDescriptionKey: "ElevenLabs failed: 403 subscription_required output_format=pcm_44100",
39+
])
40+
#expect(TalkModeManager._test_isPCMFormatRejectedByAPI(error))
41+
}
42+
43+
/// An error from a different domain (`StreamingAudio`) with a negative code must not be
/// classified as a PCM format rejection.
@Test func ignoresGenericPlaybackFailuresForPCMFormatRejection() {
44+
let error = NSError(
45+
domain: "StreamingAudio",
46+
code: -1,
47+
userInfo: [NSLocalizedDescriptionKey: "queue enqueue failed"])
48+
#expect(TalkModeManager._test_isPCMFormatRejectedByAPI(error) == false)
49+
}
3150
}

0 commit comments

Comments
 (0)