feat: Add global quota config to vertex rag engine api

Google APIs · copybara-github · commit e09604101b0a · 2025-04-07T16:33:24.000-07:00
PiperOrigin-RevId: 744892201
diff --git a/google/cloud/aiplatform/v1beta1/vertex_rag_data.proto b/google/cloud/aiplatform/v1beta1/vertex_rag_data.proto
@@ -460,6 +460,15 @@ message RagFileParsingConfig {
     // project to set an appropriate value here. If unspecified, a default value
     // of 120 QPM would be used.
     int32 max_parsing_requests_per_min = 2;
+
+    // The maximum number of requests the job is allowed to make to the Document
+    // AI processor per minute in this project. Consult
+    // https://cloud.google.com/document-ai/quotas and the Quota page for your
+    // project to set an appropriate value here.
+    // If this value is not specified,
+    // max_parsing_requests_per_min will be used by the indexing
+    // pipeline as the global limit.
+    int32 global_max_parsing_requests_per_min = 3;
   }
 
   // Specifies the advanced parsing for RagFiles.
@@ -476,6 +485,15 @@ message RagFileParsingConfig {
     // a default value of 5000 QPM would be used.
     int32 max_parsing_requests_per_min = 2;
 
+    // The maximum number of requests the job is allowed to make to the
+    // LLM per minute in this project. Consult
+    // https://cloud.google.com/vertex-ai/generative-ai/docs/quotas
+    // and your document size to set an appropriate value here.
+    // If this value is not specified,
+    // max_parsing_requests_per_min will be used by the indexing pipeline job
+    // as the global limit.
+    int32 global_max_parsing_requests_per_min = 4;
+
     // The prompt to use for parsing. If not specified, a default prompt will
     // be used.
     string custom_parsing_prompt = 3;
@@ -580,4 +598,13 @@ message ImportRagFilesConfig {
   // If unspecified, a default value of 1,000 QPM would be used.
   int32 max_embedding_requests_per_min = 5
       [(google.api.field_behavior) = OPTIONAL];
+
+  // Optional. The max number of queries per minute that the indexing pipeline
+  // job is allowed to make to the embedding model specified in the project.
+  // Please follow the quota usage guideline of the embedding model you use to
+  // set the value properly. If this value is not specified,
+  // max_embedding_requests_per_min will be used by the indexing pipeline job
+  // as the global limit.
+  int32 global_max_embedding_requests_per_min = 18
+      [(google.api.field_behavior) = OPTIONAL];
 }