@@ -27,6 +27,7 @@ option go_package = "cloud.google.com/go/texttospeech/apiv1/texttospeechpb;textt
option java_multiple_files = true;
option java_outer_classname = "TextToSpeechProto";
option java_package = "com.google.cloud.texttospeech.v1";
option objc_class_prefix = "CTTS";
option php_namespace = "Google\\Cloud\\TextToSpeech\\V1";
option ruby_package = "Google::Cloud::TextToSpeech::V1";
3233option (google.api.resource_definition ) = {
@@ -58,6 +59,11 @@ service TextToSpeech {
5859 };
5960 option (google.api.method_signature ) = "input,voice,audio_config" ;
6061 }
62+
63+ // Performs bidirectional streaming speech synthesis: receive audio while
64+ // sending text.
65+ rpc StreamingSynthesize (stream StreamingSynthesizeRequest )
66+ returns (stream StreamingSynthesizeResponse ) {}
6167}
6268
6369// Gender of the voice as described in
@@ -191,8 +197,9 @@ message VoiceSelectionParams {
191197 // Bokmal) instead of "no" (Norwegian)".
192198 string language_code = 1 [(google.api.field_behavior ) = REQUIRED ];
193199
194- // The name of the voice. If not set, the service will choose a
195- // voice based on the other parameters such as language_code and gender.
200+ // The name of the voice. If both the name and the gender are not set,
201+ // the service will choose a voice based on the other parameters such as
202+ // language_code.
196203 string name = 2 ;
197204
198205 // The preferred gender of the voice. If not set, the service will
@@ -302,3 +309,48 @@ message SynthesizeSpeechResponse {
302309 // whereas JSON representations use base64.
303310 bytes audio_content = 1 ;
304311}
312+
// Provides configuration information for the StreamingSynthesize request.
message StreamingSynthesizeConfig {
  // Required. The desired voice of the synthesized audio.
  VoiceSelectionParams voice = 1 [(google.api.field_behavior) = REQUIRED];
}
318+
// Input to be synthesized.
message StreamingSynthesisInput {
  // The source of the input to synthesize.
  oneof input_source {
    // The raw text to be synthesized. It is recommended that each input
    // contains complete, terminating sentences, as this will likely result in
    // better prosody in the output audio. That being said, users are free to
    // input text however they please.
    string text = 1;
  }
}
329+
// Request message for the `StreamingSynthesize` method. Multiple
// `StreamingSynthesizeRequest` messages are sent in one call.
// The first message must contain a `streaming_config` that
// fully specifies the request configuration and must not contain `input`. All
// subsequent messages must only have `input` set.
message StreamingSynthesizeRequest {
  // The request to be sent, either a StreamingSynthesizeConfig or
  // StreamingSynthesisInput.
  oneof streaming_request {
    // StreamingSynthesizeConfig to be used in this streaming attempt. Only
    // specified in the first message sent in a `StreamingSynthesize` call.
    StreamingSynthesizeConfig streaming_config = 1;

    // Input to synthesize. Specified in all messages but the first in a
    // `StreamingSynthesize` call.
    StreamingSynthesisInput input = 2;
  }
}
348+
// `StreamingSynthesizeResponse` is the only message returned to the
// client by `StreamingSynthesize` method. A series of zero or more
// `StreamingSynthesizeResponse` messages are streamed back to the client.
message StreamingSynthesizeResponse {
  // The audio data bytes encoded as specified in the request. This is
  // headerless LINEAR16 audio with a sample rate of 24000.
  bytes audio_content = 1;
}