Skip to content

Commit a94df49

Browse files
Google APIs authored and copybara-github committed
feat: Support MULAW audio encoding
feat: Support MP3_64_KBPS audio encoding
feat: Support timepointing via SSML <mark> tag

PiperOrigin-RevId: 323424211
1 parent ca22cfe commit a94df49

2 files changed

Lines changed: 39 additions & 13 deletions

File tree

google/cloud/texttospeech/v1beta1/cloud_tts.proto

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
14-
//
1514

1615
syntax = "proto3";
1716

@@ -83,7 +82,7 @@ enum SsmlVoiceGender {
8382
// A female voice.
8483
FEMALE = 2;
8584

86-
// A gender-neutral voice.
85+
// A gender-neutral voice. This voice is not yet supported.
8786
NEUTRAL = 3;
8887
}
8988

@@ -100,11 +99,18 @@ enum AudioEncoding {
10099
// MP3 audio at 32kbps.
101100
MP3 = 2;
102101

102+
// MP3 at 64kbps.
103+
MP3_64_KBPS = 4;
104+
103105
// Opus encoded audio wrapped in an ogg container. The result will be a
104106
// file which can be played natively on Android, and in browsers (at least
105107
// Chrome and Firefox). The quality of the encoding is considerably higher
106108
// than MP3 while using approximately the same bitrate.
107109
OGG_OPUS = 3;
110+
111+
// 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
112+
// Audio content returned as MULAW also contains a WAV header.
113+
MULAW = 5;
108114
}
109115

110116
// The message returned to the client by the `ListVoices` method.
@@ -132,6 +138,15 @@ message Voice {
132138

133139
// The top-level message sent by the client for the `SynthesizeSpeech` method.
134140
message SynthesizeSpeechRequest {
141+
// The kinds of timepoint information a client can ask to have returned in the response.
142+
enum TimepointType {
143+
// Not specified (the zero/default value). No timepoint information will be returned.
144+
TIMEPOINT_TYPE_UNSPECIFIED = 0;
145+
146+
// Timepoint information for the `<mark>` tags in the SSML input will be returned.
147+
SSML_MARK = 1;
148+
}
149+
135150
// Required. The Synthesizer requires either plain text or SSML as input.
136151
SynthesisInput input = 1 [(google.api.field_behavior) = REQUIRED];
137152

@@ -140,6 +155,9 @@ message SynthesizeSpeechRequest {
140155

141156
// Required. The configuration of the synthesized audio.
142157
AudioConfig audio_config = 3 [(google.api.field_behavior) = REQUIRED];
158+
159+
// Whether and what timepoints should be returned in the response.
160+
repeated TimepointType enable_time_pointing = 4;
143161
}
144162

145163
// Contains text input to be synthesized. Either `text` or `ssml` must be
@@ -251,4 +269,21 @@ message SynthesizeSpeechResponse {
251269
// with all bytes fields, protobuffers use a pure binary representation,
252270
// whereas JSON representations use base64.
253271
bytes audio_content = 1;
272+
273+
// A link between a position in the original request input and a corresponding
274+
// time in the output audio. It's only supported via `<mark>` of SSML input.
275+
repeated Timepoint timepoints = 2;
276+
277+
// The audio metadata of `audio_content`.
278+
AudioConfig audio_config = 4;
279+
}
280+
281+
// Maps a certain point in the input text to the corresponding time in the
282+
// output audio. Field numbers (4 and 3) are wire identifiers; do not renumber them.
283+
message Timepoint {
284+
// Timepoint name as received from the client within the `<mark>` tag.
285+
string mark_name = 4;
286+
287+
// Time offset, in seconds, from the start of the synthesized audio.
288+
double time_seconds = 3;
254289
}

google/cloud/texttospeech/v1beta1/texttospeech_grpc_service_config.json

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,8 @@
11
{
22
"methodConfig": [
33
{
4-
"name": [
5-
{
6-
"service": "google.cloud.texttospeech.v1beta1.TextToSpeech",
7-
"method": "ListVoices"
8-
},
9-
{
10-
"service": "google.cloud.texttospeech.v1beta1.TextToSpeech",
11-
"method": "SynthesizeSpeech"
12-
}
13-
],
14-
"timeout": "600s",
4+
"name": [{ "service": "google.cloud.texttospeech.v1beta1.TextToSpeech" }],
5+
"timeout": "300s",
156
"retryPolicy": {
167
"initialBackoff": "0.100s",
178
"maxBackoff": "60s",

0 commit comments

Comments
 (0)