@@ -27,6 +27,7 @@ option go_package = "cloud.google.com/go/texttospeech/apiv1beta1/texttospeechpb;
2727option java_multiple_files = true ;
2828option java_outer_classname = "TextToSpeechProto" ;
2929option java_package = "com.google.cloud.texttospeech.v1beta1" ;
30+ option objc_class_prefix = "CTTS" ;
3031option php_namespace = "Google\\Cloud\\TextToSpeech\\V1beta1" ;
3132option ruby_package = "Google::Cloud::TextToSpeech::V1beta1" ;
3233option (google.api.resource_definition ) = {
@@ -58,6 +59,11 @@ service TextToSpeech {
5859 };
5960 option (google.api.method_signature ) = "input,voice,audio_config" ;
6061 }
62+
  // Performs bidirectional streaming speech synthesis: the client streams text
  // input and receives synthesized audio while still sending further text.
  // The first `StreamingSynthesizeRequest` must contain only a
  // `streaming_config`; every subsequent request must contain only `input`.
  rpc StreamingSynthesize(stream StreamingSynthesizeRequest)
      returns (stream StreamingSynthesizeResponse) {}
6167}
6268
6369// Gender of the voice as described in
@@ -206,8 +212,9 @@ message VoiceSelectionParams {
206212 // Bokmal) instead of "no" (Norwegian)".
207213 string language_code = 1 [(google.api.field_behavior ) = REQUIRED ];
208214
209- // The name of the voice. If not set, the service will choose a
210- // voice based on the other parameters such as language_code and gender.
215+ // The name of the voice. If both the name and the gender are not set,
216+ // the service will choose a voice based on the other parameters such as
217+ // language_code.
211218 string name = 2 ;
212219
213220 // The preferred gender of the voice. If not set, the service will
@@ -334,3 +341,48 @@ message Timepoint {
334341 // Time offset in seconds from the start of the synthesized audio.
335342 double time_seconds = 3 ;
336343}
344+
// Provides configuration information for the StreamingSynthesize request.
// Sent only in the first `StreamingSynthesizeRequest` of a streaming call.
message StreamingSynthesizeConfig {
  // Required. The desired voice of the synthesized audio.
  VoiceSelectionParams voice = 1 [(google.api.field_behavior) = REQUIRED];
}
350+
// Input to be synthesized.
message StreamingSynthesisInput {
  // The source of the text to synthesize. Modeled as a oneof so additional
  // input formats can be added later without breaking the wire contract.
  oneof input_source {
    // The raw text to be synthesized. It is recommended that each input
    // contains complete, terminating sentences, as this will likely result in
    // better prosody in the output audio. That being said, users are free to
    // input text however they please.
    string text = 1;
  }
}
361+
// Request message for the `StreamingSynthesize` method. Multiple
// `StreamingSynthesizeRequest` messages are sent in one call.
// The first message must contain a `streaming_config` that
// fully specifies the request configuration and must not contain `input`. All
// subsequent messages must only have `input` set.
message StreamingSynthesizeRequest {
  // The request to be sent, either a StreamingSynthesizeConfig or
  // StreamingSynthesisInput. Exactly one is set per message.
  oneof streaming_request {
    // StreamingSynthesizeConfig to be used in this streaming attempt. Only
    // specified in the first message sent in a `StreamingSynthesize` call.
    StreamingSynthesizeConfig streaming_config = 1;

    // Input to synthesize. Specified in all messages but the first in a
    // `StreamingSynthesize` call.
    StreamingSynthesisInput input = 2;
  }
}
380+
// `StreamingSynthesizeResponse` is the only message returned to the
// client by the `StreamingSynthesize` method. A series of zero or more
// `StreamingSynthesizeResponse` messages are streamed back to the client.
message StreamingSynthesizeResponse {
  // The audio data bytes encoded as specified in the request. This is
  // headerless LINEAR16 audio with a sample rate of 24000.
  bytes audio_content = 1;
}
0 commit comments