feat: Support MULAW audio encoding

Google APIs · copybara-github · commit a94df49e8f20 · 2020-07-27T13:06:13.000-07:00
feat: Support MP3_64_KBPS audio encoding
feat: Support timepointing via SSML <mark> tag

PiperOrigin-RevId: 323424211
diff --git a/google/cloud/texttospeech/v1beta1/cloud_tts.proto b/google/cloud/texttospeech/v1beta1/cloud_tts.proto
@@ -11,7 +11,6 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-//
 
 syntax = "proto3";
 
@@ -83,7 +82,7 @@ enum SsmlVoiceGender {
   // A female voice.
   FEMALE = 2;
 
-  // A gender-neutral voice.
+  // A gender-neutral voice. This voice is not yet supported.
   NEUTRAL = 3;
 }
 
@@ -100,11 +99,18 @@ enum AudioEncoding {
   // MP3 audio at 32kbps.
   MP3 = 2;
 
+  // MP3 at 64kbps.
+  MP3_64_KBPS = 4;
+
   // Opus encoded audio wrapped in an ogg container. The result will be a
   // file which can be played natively on Android, and in browsers (at least
   // Chrome and Firefox). The quality of the encoding is considerably higher
   // than MP3 while using approximately the same bitrate.
   OGG_OPUS = 3;
+
+  // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
+  // Audio content returned as MULAW also contains a WAV header.
+  MULAW = 5;
 }
 
 // The message returned to the client by the `ListVoices` method.
@@ -132,6 +138,15 @@ message Voice {
 
 // The top-level message sent by the client for the `SynthesizeSpeech` method.
 message SynthesizeSpeechRequest {
+  // The type of timepoint information that is returned in the response.
+  enum TimepointType {
+    // Not specified. No timepoint information will be returned.
+    TIMEPOINT_TYPE_UNSPECIFIED = 0;
+
+    // Timepoint information of `<mark>` tags in SSML input will be returned.
+    SSML_MARK = 1;
+  }
+
   // Required. The Synthesizer requires either plain text or SSML as input.
   SynthesisInput input = 1 [(google.api.field_behavior) = REQUIRED];
 
@@ -140,6 +155,9 @@ message SynthesizeSpeechRequest {
 
   // Required. The configuration of the synthesized audio.
   AudioConfig audio_config = 3 [(google.api.field_behavior) = REQUIRED];
+
+  // Whether and what timepoints should be returned in the response.
+  repeated TimepointType enable_time_pointing = 4;
 }
 
 // Contains text input to be synthesized. Either `text` or `ssml` must be
@@ -251,4 +269,21 @@ message SynthesizeSpeechResponse {
   // with all bytes fields, protobuffers use a pure binary representation,
   // whereas JSON representations use base64.
   bytes audio_content = 1;
+
+  // A link between a position in the original request input and a corresponding
+  // time in the output audio. It's only supported via `<mark>` of SSML input.
+  repeated Timepoint timepoints = 2;
+
+  // The audio metadata of `audio_content`.
+  AudioConfig audio_config = 4;
+}
+
+// This contains a mapping between a certain point in the input text and a
+// corresponding time in the output audio.
+message Timepoint {
+  // Timepoint name as received from the client within `<mark>` tag.
+  string mark_name = 4;
+
+  // Time offset in seconds from the start of the synthesized audio.
+  double time_seconds = 3;
 }
diff --git a/google/cloud/texttospeech/v1beta1/texttospeech_grpc_service_config.json b/google/cloud/texttospeech/v1beta1/texttospeech_grpc_service_config.json
@@ -1,17 +1,8 @@
 {
   "methodConfig": [
     {
-      "name": [
-        {
-          "service": "google.cloud.texttospeech.v1beta1.TextToSpeech",
-          "method": "ListVoices"
-        },
-        {
-          "service": "google.cloud.texttospeech.v1beta1.TextToSpeech",
-          "method": "SynthesizeSpeech"
-        }
-      ],
-      "timeout": "600s",
+      "name": [{ "service": "google.cloud.texttospeech.v1beta1.TextToSpeech" }],
+      "timeout": "300s",
       "retryPolicy": {
         "initialBackoff": "0.100s",
         "maxBackoff": "60s",