Skip to content

Commit fbc26ef

Browse files
leepokai and claude
committed
fix(ios): auto-fallback from PCM to MP3 for ElevenLabs TTS
The default output format pcm_44100 requires an ElevenLabs Pro tier subscription. Users on free or starter plans get a silent 403 failure and hear no audio. Instead of hardcoding MP3, keep pcm_44100 as the default (better quality for Pro users) but remember the failure: when PCM playback fails without having been interrupted (as happens when the API rejects the PCM request), set pcmFormatUnavailable and use mp3_44100_128 as the default for all subsequent requests in the session that do not explicitly specify an output format. The flag is reset together with the gateway talk config on reload, so PCM is re-probed after reconnection. Also standardize the MP3 fallback format from mp3_44100 to mp3_44100_128 for a consistent bitrate. Co-Authored-By: Claude Opus 4.6 <[email protected]>
1 parent da0ba1b commit fbc26ef

File tree

1 file changed

+18
-5
lines changed

1 file changed

+18
-5
lines changed

apps/ios/Sources/Voice/TalkModeManager.swift

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ final class TalkModeManager: NSObject {
7272
private var mainSessionKey: String = "main"
7373
private var fallbackVoiceId: String?
7474
private var lastPlaybackWasPCM: Bool = false
75+
/// Set when the ElevenLabs API rejects PCM format (e.g. 403 subscription_required).
76+
/// Once set, all subsequent requests in this session use MP3 instead of re-trying PCM.
77+
private var pcmFormatUnavailable: Bool = false
7578
var pcmPlayer: PCMStreamingAudioPlaying = PCMStreamingAudioPlayer.shared
7679
var mp3Player: StreamingAudioPlaying = StreamingAudioPlayer.shared
7780

@@ -1004,7 +1007,8 @@ final class TalkModeManager: NSObject {
10041007
let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)?
10051008
.trimmingCharacters(in: .whitespacesAndNewlines)
10061009
let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil
1007-
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100")
1010+
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(
1011+
requestedOutputFormat ?? self.effectiveDefaultOutputFormat)
10081012
if outputFormat == nil, let requestedOutputFormat {
10091013
self.logger.warning(
10101014
"talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)")
@@ -1051,8 +1055,9 @@ final class TalkModeManager: NSObject {
10511055
self.lastPlaybackWasPCM = true
10521056
var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate)
10531057
if !playback.finished, playback.interruptedAt == nil {
1054-
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
1058+
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
10551059
self.logger.warning("pcm playback failed; retrying mp3")
1060+
self.pcmFormatUnavailable = true
10561061
self.lastPlaybackWasPCM = false
10571062
let mp3Stream = client.streamSynthesize(
10581063
voiceId: voiceId,
@@ -1388,7 +1393,7 @@ final class TalkModeManager: NSObject {
13881393

13891394
private func resolveIncrementalPrefetchOutputFormat(context: IncrementalSpeechContext) -> String? {
13901395
if TalkTTSValidation.pcmSampleRate(from: context.outputFormat) != nil {
1391-
return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
1396+
return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
13921397
}
13931398
return context.outputFormat
13941399
}
@@ -1474,7 +1479,8 @@ final class TalkModeManager: NSObject {
14741479
let desiredOutputFormat = (directive?.outputFormat ?? self.defaultOutputFormat)?
14751480
.trimmingCharacters(in: .whitespacesAndNewlines)
14761481
let requestedOutputFormat = (desiredOutputFormat?.isEmpty == false) ? desiredOutputFormat : nil
1477-
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(requestedOutputFormat ?? "pcm_44100")
1482+
let outputFormat = ElevenLabsTTSClient.validatedOutputFormat(
1483+
requestedOutputFormat ?? self.effectiveDefaultOutputFormat)
14781484
if outputFormat == nil, let requestedOutputFormat {
14791485
self.logger.warning(
14801486
"talk output_format unsupported for local playback: \(requestedOutputFormat, privacy: .public)")
@@ -1525,6 +1531,11 @@ final class TalkModeManager: NSObject {
15251531
latencyTier: TalkTTSValidation.validatedLatencyTier(context.directive?.latencyTier))
15261532
}
15271533

1534+
/// Returns `mp3_44100_128` when the API has already rejected PCM, otherwise `pcm_44100`.
1535+
private var effectiveDefaultOutputFormat: String {
1536+
self.pcmFormatUnavailable ? "mp3_44100_128" : "pcm_44100"
1537+
}
1538+
15281539
private static func makeBufferedAudioStream(chunks: [Data]) -> AsyncThrowingStream<Data, Error> {
15291540
AsyncThrowingStream { continuation in
15301541
for chunk in chunks {
@@ -1580,8 +1591,9 @@ final class TalkModeManager: NSObject {
15801591
var playback = await self.pcmPlayer.play(stream: stream, sampleRate: sampleRate)
15811592
if !playback.finished, playback.interruptedAt == nil {
15821593
self.logger.warning("pcm playback failed; retrying mp3")
1594+
self.pcmFormatUnavailable = true
15831595
self.lastPlaybackWasPCM = false
1584-
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100")
1596+
let mp3Format = ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
15851597
let mp3Stream = client.streamSynthesize(
15861598
voiceId: voiceId,
15871599
request: self.makeIncrementalTTSRequest(
@@ -1991,6 +2003,7 @@ extension TalkModeManager {
19912003
self.gatewayTalkDefaultModelId = nil
19922004
self.gatewayTalkApiKeyConfigured = false
19932005
self.gatewayTalkConfigLoaded = false
2006+
self.pcmFormatUnavailable = false
19942007
}
19952008
}
19962009

0 commit comments

Comments
 (0)