@@ -85,6 +85,11 @@ message EvaluateDatasetOperationMetadata {
8585
8686// Response returned in the long-running operation (LRO) for EvaluationService.EvaluateDataset.
8787message EvaluateDatasetResponse {
88+ // Output only. Aggregation statistics derived from results of
89+ // EvaluationService.EvaluateDataset.
90+ AggregationOutput aggregation_output = 1
91+ [(google.api.field_behavior ) = OUTPUT_ONLY ];
92+
8893 // Output only. Output info for EvaluationService.EvaluateDataset.
8994 OutputInfo output_info = 3 [(google.api.field_behavior ) = OUTPUT_ONLY ];
9095}
@@ -99,6 +104,39 @@ message OutputInfo {
99104 }
100105}
101106
107+ // Aggregated evaluation output covering the entire dataset and all metrics.
108+ message AggregationOutput {
109+ // The dataset used for evaluation and aggregation.
110+ EvaluationDataset dataset = 1 ;
111+
112+ // Aggregation results, one AggregationResult per metric.
113+ repeated AggregationResult aggregation_results = 2 ;
114+ }
115+
116+ // The aggregation result for a single metric.
117+ message AggregationResult {
118+ // The aggregation result. As a oneof, at most one of the fields below is set.
119+ oneof aggregation_result {
120+ // Result for pointwise metric.
121+ PointwiseMetricResult pointwise_metric_result = 5 ;
122+
123+ // Result for pairwise metric.
124+ PairwiseMetricResult pairwise_metric_result = 6 ;
125+
126+ // Result for exact match metric.
127+ ExactMatchMetricValue exact_match_metric_value = 7 ;
128+
129+ // Result for bleu metric.
130+ BleuMetricValue bleu_metric_value = 8 ;
131+
132+ // Result for rouge metric.
133+ RougeMetricValue rouge_metric_value = 9 ;
134+ }
135+
136+ // Aggregation metric (a Metric.AggregationMetric value) for this result.
137+ Metric.AggregationMetric aggregation_metric = 4 ;
138+ }
139+
102140// Request message for EvaluationService.EvaluateDataset.
103141message EvaluateDatasetRequest {
104142 // Required. The resource name of the Location to evaluate the dataset.
@@ -141,34 +179,34 @@ message Metric {
141179 // Unspecified aggregation metric.
142180 AGGREGATION_METRIC_UNSPECIFIED = 0 ;
143181
144- // Average aggregation metric.
182+ // Average aggregation metric. Not supported for pairwise metric.
145183 AVERAGE = 1 ;
146184
147185 // Mode aggregation metric.
148186 MODE = 2 ;
149187
150- // Standard deviation aggregation metric.
188+ // Standard deviation aggregation metric. Not supported for pairwise metric.
151189 STANDARD_DEVIATION = 3 ;
152190
153- // Variance aggregation metric.
191+ // Variance aggregation metric. Not supported for pairwise metric.
154192 VARIANCE = 4 ;
155193
156- // Minimum aggregation metric.
194+ // Minimum aggregation metric. Not supported for pairwise metric.
157195 MINIMUM = 5 ;
158196
159- // Maximum aggregation metric.
197+ // Maximum aggregation metric. Not supported for pairwise metric.
160198 MAXIMUM = 6 ;
161199
162- // Median aggregation metric.
200+ // Median aggregation metric. Not supported for pairwise metric.
163201 MEDIAN = 7 ;
164202
165- // 90th percentile aggregation metric.
203+ // 90th percentile aggregation metric. Not supported for pairwise metric.
166204 PERCENTILE_P90 = 8 ;
167205
168- // 95th percentile aggregation metric.
206+ // 95th percentile aggregation metric. Not supported for pairwise metric.
169207 PERCENTILE_P95 = 9 ;
170208
171- // 99th percentile aggregation metric.
209+ // 99th percentile aggregation metric. Not supported for pairwise metric.
172210 PERCENTILE_P99 = 10 ;
173211 }
174212
@@ -216,9 +254,9 @@ message AutoraterConfig {
216254 // is 32.
217255 optional int32 sampling_count = 1 [(google.api.field_behavior ) = OPTIONAL ];
218256
219- // Optional. Whether to flip the candidate and baseline responses.
220- // This is only applicable to the pairwise metric. If enabled, also provide
221- // PairwiseMetricSpec.candidate_response_field_name and
257+ // Optional. Default is true. Whether to flip the candidate and baseline
258+ // responses. This is only applicable to the pairwise metric. If enabled, also
259+ // provide PairwiseMetricSpec.candidate_response_field_name and
222260 // PairwiseMetricSpec.baseline_response_field_name. When rendering
223261 // PairwiseMetricSpec.metric_prompt_template, the candidate and baseline
224262 // fields will be flipped for half of the samples to reduce bias.
0 commit comments