@@ -7,6 +7,23 @@ import Observation
77import OSLog
88import Speech
99
/// Lock-protected holder for the error (if any) recorded while a stream was consumed.
///
/// Marked `@unchecked Sendable` because every access to the stored error goes
/// through `lock`.
private final class StreamFailureBox: @unchecked Sendable {
    private let lock = NSLock()
    private var valueInternal: Error?

    /// Stores `error`, replacing any previously stored error.
    func set(_ error: Error) {
        self.lock.withLock {
            self.valueInternal = error
        }
    }

    /// The most recently stored error, or `nil` when `set(_:)` has not been called.
    var value: Error? {
        self.lock.withLock { self.valueInternal }
    }
}
26+
1027// This file intentionally centralizes talk mode state + behavior.
1128// It's large, and splitting would force `private` -> `fileprivate` across many members.
1229// We'll refactor into smaller files when the surface stabilizes.
@@ -72,6 +89,9 @@ final class TalkModeManager: NSObject {
7289 private var mainSessionKey : String = " main "
7390 private var fallbackVoiceId : String ?
7491 private var lastPlaybackWasPCM : Bool = false
92+ /// Set when the ElevenLabs API rejects PCM format (e.g. 403 subscription_required).
93+ /// Once set, all subsequent requests in this session use MP3 instead of re-trying PCM.
94+ private var pcmFormatUnavailable : Bool = false
7595 var pcmPlayer : PCMStreamingAudioPlaying = PCMStreamingAudioPlayer . shared
7696 var mp3Player : StreamingAudioPlaying = StreamingAudioPlayer . shared
7797
@@ -1007,7 +1027,8 @@ final class TalkModeManager: NSObject {
10071027 let desiredOutputFormat = ( directive? . outputFormat ?? self . defaultOutputFormat) ?
10081028 . trimmingCharacters ( in: . whitespacesAndNewlines)
10091029 let requestedOutputFormat = ( desiredOutputFormat? . isEmpty == false ) ? desiredOutputFormat : nil
1010- let outputFormat = ElevenLabsTTSClient . validatedOutputFormat ( requestedOutputFormat ?? " pcm_44100 " )
1030+ let outputFormat = ElevenLabsTTSClient . validatedOutputFormat (
1031+ requestedOutputFormat ?? self . effectiveDefaultOutputFormat)
10111032 if outputFormat == nil , let requestedOutputFormat {
10121033 self . logger. warning (
10131034 " talk output_format unsupported for local playback: \( requestedOutputFormat, privacy: . public) " )
@@ -1036,7 +1057,7 @@ final class TalkModeManager: NSObject {
10361057 let request = makeRequest ( outputFormat: outputFormat)
10371058
10381059 let client = ElevenLabsTTSClient ( apiKey: apiKey)
1039- let stream = client. streamSynthesize ( voiceId: voiceId, request: request)
1060+ let rawStream = client. streamSynthesize ( voiceId: voiceId, request: request)
10401061
10411062 if self . interruptOnSpeech {
10421063 do {
@@ -1051,11 +1072,16 @@ final class TalkModeManager: NSObject {
10511072 let sampleRate = TalkTTSValidation . pcmSampleRate ( from: outputFormat)
10521073 let result : StreamingPlaybackResult
10531074 if let sampleRate {
1075+ let streamFailure = StreamFailureBox ( )
1076+ let stream = Self . monitorStreamFailures ( rawStream, failureBox: streamFailure)
10541077 self . lastPlaybackWasPCM = true
10551078 var playback = await self . pcmPlayer. play ( stream: stream, sampleRate: sampleRate)
10561079 if !playback. finished, playback. interruptedAt == nil {
1057- let mp3Format = ElevenLabsTTSClient . validatedOutputFormat ( " mp3_44100 " )
1080+ let mp3Format = ElevenLabsTTSClient . validatedOutputFormat ( " mp3_44100_128 " )
10581081 self . logger. warning ( " pcm playback failed; retrying mp3 " )
1082+ if Self . isPCMFormatRejectedByAPI ( streamFailure. value) {
1083+ self . pcmFormatUnavailable = true
1084+ }
10591085 self . lastPlaybackWasPCM = false
10601086 let mp3Stream = client. streamSynthesize (
10611087 voiceId: voiceId,
@@ -1065,7 +1091,7 @@ final class TalkModeManager: NSObject {
10651091 result = playback
10661092 } else {
10671093 self . lastPlaybackWasPCM = false
1068- result = await self . mp3Player. play ( stream: stream )
1094+ result = await self . mp3Player. play ( stream: rawStream )
10691095 }
10701096 let duration = Date ( ) . timeIntervalSince ( started)
10711097 self . logger. info ( " elevenlabs stream finished= \( result. finished, privacy: . public) dur= \( duration, privacy: . public) s " )
@@ -1391,7 +1417,7 @@ final class TalkModeManager: NSObject {
13911417
/// Chooses the output format to use when prefetching incremental speech audio.
///
/// - Parameter context: Speech context whose `outputFormat` is inspected.
/// - Returns: The validated `mp3_44100_128` format when the context's format yields a
///   PCM sample rate; otherwise the context's own format, unchanged.
private func resolveIncrementalPrefetchOutputFormat(context: IncrementalSpeechContext) -> String? {
    guard TalkTTSValidation.pcmSampleRate(from: context.outputFormat) == nil else {
        // The context asks for PCM: prefetch as MP3 instead.
        return ElevenLabsTTSClient.validatedOutputFormat("mp3_44100_128")
    }
    return context.outputFormat
}
@@ -1480,7 +1506,8 @@ final class TalkModeManager: NSObject {
14801506 let desiredOutputFormat = ( directive? . outputFormat ?? self . defaultOutputFormat) ?
14811507 . trimmingCharacters ( in: . whitespacesAndNewlines)
14821508 let requestedOutputFormat = ( desiredOutputFormat? . isEmpty == false ) ? desiredOutputFormat : nil
1483- let outputFormat = ElevenLabsTTSClient . validatedOutputFormat ( requestedOutputFormat ?? " pcm_44100 " )
1509+ let outputFormat = ElevenLabsTTSClient . validatedOutputFormat (
1510+ requestedOutputFormat ?? self . effectiveDefaultOutputFormat)
14841511 if outputFormat == nil , let requestedOutputFormat {
14851512 self . logger. warning (
14861513 " talk output_format unsupported for local playback: \( requestedOutputFormat, privacy: . public) " )
@@ -1534,6 +1561,44 @@ final class TalkModeManager: NSObject {
15341561 latencyTier: TalkTTSValidation . validatedLatencyTier ( context. directive? . latencyTier) )
15351562 }
15361563
/// Fallback output format: `mp3_44100_128` once `pcmFormatUnavailable` has been set,
/// `pcm_44100` until then.
private var effectiveDefaultOutputFormat: String {
    if self.pcmFormatUnavailable {
        return "mp3_44100_128"
    }
    return "pcm_44100"
}
1568+
/// Wraps `stream` in a pass-through stream that records a thrown error in `failureBox`.
///
/// - Parameters:
///   - stream: Upstream chunks, forwarded unchanged and in order.
///   - failureBox: Receives the error if iterating `stream` throws.
/// - Returns: A stream that yields every upstream chunk and then finishes the same way
///   the upstream did (normally, or by throwing the same error).
private static func monitorStreamFailures(
    _ stream: AsyncThrowingStream<Data, Error>,
    failureBox: StreamFailureBox
) -> AsyncThrowingStream<Data, Error> {
    AsyncThrowingStream { continuation in
        let forwarder = Task {
            do {
                for try await data in stream {
                    continuation.yield(data)
                }
            } catch {
                // Record the failure before surfacing it so the caller can inspect it later.
                failureBox.set(error)
                continuation.finish(throwing: error)
                return
            }
            continuation.finish()
        }
        // Stop pulling from the upstream once the wrapper stream terminates.
        continuation.onTermination = { _ in forwarder.cancel() }
    }
}
1591+
/// Reports whether `error` matches the pattern this file treats as the API rejecting PCM.
///
/// The error must bridge to an `NSError` with domain `"ElevenLabsTTS"` and a code of at
/// least 400, and its lowercased description must contain one of the markers below.
///
/// - Parameter error: The error captured from the stream, or `nil` if none was recorded.
/// - Returns: `true` when all of the conditions above hold; `false` otherwise (including `nil`).
private static func isPCMFormatRejectedByAPI(_ error: Error?) -> Bool {
    guard let nsError = error as NSError?,
          nsError.domain == "ElevenLabsTTS",
          nsError.code >= 400
    else {
        return false
    }

    let description = nsError.userInfo[NSLocalizedDescriptionKey] as? String
        ?? nsError.localizedDescription
    let haystack = description.lowercased()

    // NOTE(review): "pcm_" is already covered by "pcm"; kept so the marker list mirrors
    // the original checks one-for-one.
    let markers = ["output_format", "pcm_", "pcm", "subscription_required"]
    return markers.contains { haystack.contains($0) }
}
1601+
15371602 private static func makeBufferedAudioStream( chunks: [ Data ] ) -> AsyncThrowingStream < Data , Error > {
15381603 AsyncThrowingStream { continuation in
15391604 for chunk in chunks {
@@ -1575,22 +1640,27 @@ final class TalkModeManager: NSObject {
15751640 text: text,
15761641 context: context,
15771642 outputFormat: context. outputFormat)
1578- let stream : AsyncThrowingStream < Data , Error >
1643+ let rawStream : AsyncThrowingStream < Data , Error >
15791644 if let prefetchedAudio, !prefetchedAudio. chunks. isEmpty {
1580- stream = Self . makeBufferedAudioStream ( chunks: prefetchedAudio. chunks)
1645+ rawStream = Self . makeBufferedAudioStream ( chunks: prefetchedAudio. chunks)
15811646 } else {
1582- stream = client. streamSynthesize ( voiceId: voiceId, request: request)
1647+ rawStream = client. streamSynthesize ( voiceId: voiceId, request: request)
15831648 }
15841649 let playbackFormat = prefetchedAudio? . outputFormat ?? context. outputFormat
15851650 let sampleRate = TalkTTSValidation . pcmSampleRate ( from: playbackFormat)
15861651 let result : StreamingPlaybackResult
15871652 if let sampleRate {
1653+ let streamFailure = StreamFailureBox ( )
1654+ let stream = Self . monitorStreamFailures ( rawStream, failureBox: streamFailure)
15881655 self . lastPlaybackWasPCM = true
15891656 var playback = await self . pcmPlayer. play ( stream: stream, sampleRate: sampleRate)
15901657 if !playback. finished, playback. interruptedAt == nil {
15911658 self . logger. warning ( " pcm playback failed; retrying mp3 " )
1659+ if Self . isPCMFormatRejectedByAPI ( streamFailure. value) {
1660+ self . pcmFormatUnavailable = true
1661+ }
15921662 self . lastPlaybackWasPCM = false
1593- let mp3Format = ElevenLabsTTSClient . validatedOutputFormat ( " mp3_44100 " )
1663+ let mp3Format = ElevenLabsTTSClient . validatedOutputFormat ( " mp3_44100_128 " )
15941664 let mp3Stream = client. streamSynthesize (
15951665 voiceId: voiceId,
15961666 request: self . makeIncrementalTTSRequest (
@@ -1602,7 +1672,7 @@ final class TalkModeManager: NSObject {
16021672 result = playback
16031673 } else {
16041674 self . lastPlaybackWasPCM = false
1605- result = await self . mp3Player. play ( stream: stream )
1675+ result = await self . mp3Player. play ( stream: rawStream )
16061676 }
16071677 if !result. finished, let interruptedAt = result. interruptedAt {
16081678 self . lastInterruptedAtSeconds = interruptedAt
@@ -1926,6 +1996,7 @@ extension TalkModeManager {
19261996
19271997 func reloadConfig( ) async {
19281998 guard let gateway else { return }
1999+ self . pcmFormatUnavailable = false
19292000 do {
19302001 let res = try await gateway. request (
19312002 method: " talk.config " ,
@@ -2105,6 +2176,10 @@ private final class AudioTapDiagnostics: @unchecked Sendable {
21052176
21062177#if DEBUG
21072178extension TalkModeManager {
/// DEBUG-only test hook that forwards to the private `isPCMFormatRejectedByAPI(_:)`.
static func _test_isPCMFormatRejectedByAPI(_ error: Error?) -> Bool {
    return Self.isPCMFormatRejectedByAPI(error)
}
2182+
21082183 func _test_seedTranscript( _ transcript: String ) {
21092184 self . lastTranscript = transcript
21102185 self . lastHeard = Date ( )
0 commit comments