Skip to content

Commit 5122617

Browse files
werktcopybara-github
authored andcommitted
Status error presentation with details
Remote Execution Status messages embedded in ExecuteResponses are extremely capable vehicles for conveying the nature of an error, and informing a user of further steps to take to remediate it. This change expands the presentation of these response Statuses, and brings all of the error details to light, by default instead of requiring --verbose_failures to investigate any details of a remote execution problem. The interpretation of precondition failures to highlight retriable responses has been expanded to ignore benign details that might be included in a response. SpawnResult error message composition has been simplified substantially, without any special behavior for 'Remote' errors, and a removal of a duplicate message printout incurred in the wake of succcessive @janakr and @olaola changes. Failure messages are now implied to be present in all spawn result failure reporting exactly once, and the failureMessage of a SpawnResult is implied to be the parameter to getDetailMessage. An example error presentation is as follows (including the modifications to SpawnResult's output formatting): ``` ERROR: /home/werkt/dev/test/BUILD:22:10: Linking test failed: (Exit 34): Remote Execution Failure: Failed Precondition: Action 4223ab2cc114385110714243a0b4a88cc743f2169b5be7d4d438a6bbba4f529f/142 is invalid Resource Info: type.googleapis.com/google.longrunning.Operation: name='shard/operations/9335fef2-184b-4d26-9a6f-2f27cebe7527', owner='tool_invocation_id:4b4bf7b1-fadd-44fd-99be-a234e7c26fc4,correlated_invocation_id:dc88325a-9317-48c0-9013-b3bb8b7a458f' Precondition Failure: (MISSING) bazel-out/k8-fastbuild/bin/test: 7872: An output could not be uploaded because it exceeded the maximum size of an entry Target //:test failed to build ``` Closes bazelbuild#12564. PiperOrigin-RevId: 3449738
1 parent 12b655b commit 5122617

13 files changed

Lines changed: 382 additions & 95 deletions

File tree

src/main/java/com/google/devtools/build/lib/actions/SpawnResult.java

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -385,11 +385,6 @@ public String getDetailMessage(
385385
String reason = "(" + status.toShortString() + ")"; // e.g "(Exit 1)"
386386
String explanation = Strings.isNullOrEmpty(message) ? "" : ": " + message;
387387

388-
if (!status().isConsideredUserError()) {
389-
String errorDetail = status().name().toLowerCase(Locale.US)
390-
.replace('_', ' ');
391-
explanation += ". Note: Remote connection/protocol failed with: " + errorDetail;
392-
}
393388
if (status() == Status.TIMEOUT) {
394389
if (getWallTime().isPresent()) {
395390
explanation +=
@@ -407,9 +402,6 @@ public String getDetailMessage(
407402
explanation += " Action tagged as local was forcibly run remotely and failed - it's "
408403
+ "possible that the action simply doesn't work remotely";
409404
}
410-
if (!Strings.isNullOrEmpty(failureMessage)) {
411-
explanation += " " + failureMessage;
412-
}
413405
return reason + explanation;
414406
}
415407

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
// Copyright 2020 The Bazel Authors. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package com.google.devtools.build.lib.remote;
16+
17+
import com.google.common.util.concurrent.ListeningScheduledExecutorService;
18+
import com.google.protobuf.Any;
19+
import com.google.protobuf.InvalidProtocolBufferException;
20+
import com.google.protobuf.util.Durations;
21+
import com.google.rpc.DebugInfo;
22+
import com.google.rpc.Help;
23+
import com.google.rpc.LocalizedMessage;
24+
import com.google.rpc.PreconditionFailure;
25+
import com.google.rpc.PreconditionFailure.Violation;
26+
import com.google.rpc.RequestInfo;
27+
import com.google.rpc.ResourceInfo;
28+
import com.google.rpc.RetryInfo;
29+
import com.google.rpc.Status;
30+
import io.grpc.Status.Code;
31+
import io.grpc.protobuf.StatusProto;
32+
33+
/** Specific retry logic for execute request with gapi Status. */
34+
class ExecuteRetrier extends RemoteRetrier {
35+
36+
private static final String VIOLATION_TYPE_MISSING = "MISSING";
37+
38+
private static class RetryInfoBackoff implements Backoff {
39+
private final int maxRetryAttempts;
40+
int retries = 0;
41+
42+
RetryInfoBackoff(int maxRetryAttempts) {
43+
this.maxRetryAttempts = maxRetryAttempts;
44+
}
45+
46+
@Override
47+
public long nextDelayMillis(Exception e) {
48+
if (retries >= maxRetryAttempts) {
49+
return -1;
50+
}
51+
RetryInfo retryInfo = getRetryInfo(e);
52+
retries++;
53+
return Durations.toMillis(retryInfo.getRetryDelay());
54+
}
55+
56+
RetryInfo getRetryInfo(Exception e) {
57+
RetryInfo retryInfo = RetryInfo.getDefaultInstance();
58+
Status status = StatusProto.fromThrowable(e);
59+
if (status != null) {
60+
for (Any detail : status.getDetailsList()) {
61+
if (detail.is(RetryInfo.class)) {
62+
try {
63+
retryInfo = detail.unpack(RetryInfo.class);
64+
} catch (InvalidProtocolBufferException protoEx) {
65+
// really shouldn't happen, ignore
66+
}
67+
}
68+
}
69+
}
70+
return retryInfo;
71+
}
72+
73+
@Override
74+
public int getRetryAttempts() {
75+
return retries;
76+
}
77+
}
78+
79+
ExecuteRetrier(
80+
int maxRetryAttempts,
81+
ListeningScheduledExecutorService retryService,
82+
CircuitBreaker circuitBreaker) {
83+
super(
84+
() -> maxRetryAttempts > 0 ? new RetryInfoBackoff(maxRetryAttempts) : RETRIES_DISABLED,
85+
ExecuteRetrier::shouldRetry,
86+
retryService,
87+
circuitBreaker);
88+
}
89+
90+
private static boolean shouldRetry(Exception e) {
91+
if (BulkTransferException.isOnlyCausedByCacheNotFoundException(e)) {
92+
return true;
93+
}
94+
Status status = StatusProto.fromThrowable(e);
95+
if (status == null || status.getDetailsCount() == 0) {
96+
return false;
97+
}
98+
boolean failedPrecondition = status.getCode() == Code.FAILED_PRECONDITION.value();
99+
for (Any detail : status.getDetailsList()) {
100+
if (detail.is(RetryInfo.class)) {
101+
// server says we can retry, regardless of other details
102+
return true;
103+
} else if (failedPrecondition) {
104+
if (detail.is(PreconditionFailure.class)) {
105+
try {
106+
PreconditionFailure f = detail.unpack(PreconditionFailure.class);
107+
if (f.getViolationsCount() == 0) {
108+
failedPrecondition = false;
109+
}
110+
for (Violation v : f.getViolationsList()) {
111+
if (!v.getType().equals(VIOLATION_TYPE_MISSING)) {
112+
failedPrecondition = false;
113+
}
114+
}
115+
// if *all* > 0 precondition failure violations have type MISSING, failedPrecondition
116+
// remains true
117+
} catch (InvalidProtocolBufferException protoEx) {
118+
// really shouldn't happen
119+
return false;
120+
}
121+
} else if (!(detail.is(DebugInfo.class)
122+
|| detail.is(Help.class)
123+
|| detail.is(LocalizedMessage.class)
124+
|| detail.is(RequestInfo.class)
125+
|| detail.is(ResourceInfo.class))) { // ignore benign details
126+
// consider all other details as failures
127+
failedPrecondition = false;
128+
}
129+
}
130+
}
131+
return failedPrecondition;
132+
}
133+
}

src/main/java/com/google/devtools/build/lib/remote/ExecutionStatusException.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,8 @@ public boolean isExecutionTimeout() {
6363
public ExecuteResponse getResponse() {
6464
return response;
6565
}
66+
67+
public Status getOriginalStatus() {
68+
return status;
69+
}
6670
}

src/main/java/com/google/devtools/build/lib/remote/ExperimentalGrpcRemoteExecutor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ ExecuteResponse waitExecution() throws IOException {
194194
//
195195
// However, we only retry Execute() if executeBackoff should retry. Also increase the retry
196196
// counter at the same time (done by nextDelayMillis()).
197-
if (e.getStatus().getCode() == Code.NOT_FOUND && executeBackoff.nextDelayMillis() >= 0) {
197+
if (e.getStatus().getCode() == Code.NOT_FOUND && executeBackoff.nextDelayMillis(e) >= 0) {
198198
lastOperation = null;
199199
return null;
200200
}

src/main/java/com/google/devtools/build/lib/remote/RemoteRetrier.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ public ExponentialBackoff(RemoteOptions options) {
174174
}
175175

176176
@Override
177-
public long nextDelayMillis() {
177+
public long nextDelayMillis(Exception e) {
178178
if (attempts == maxAttempts) {
179179
return -1;
180180
}
@@ -221,13 +221,13 @@ public void reset() {
221221
}
222222

223223
@Override
224-
public long nextDelayMillis() {
224+
public long nextDelayMillis(Exception e) {
225225
if (currentBackoff == null) {
226226
currentBackoff = backoffSupplier.get();
227227
retries++;
228228
return 0;
229229
}
230-
return currentBackoff.nextDelayMillis();
230+
return currentBackoff.nextDelayMillis(e);
231231
}
232232

233233
@Override

src/main/java/com/google/devtools/build/lib/remote/RemoteSpawnRunner.java

Lines changed: 5 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -85,17 +85,12 @@
8585
import com.google.devtools.build.lib.vfs.Path;
8686
import com.google.devtools.build.lib.vfs.PathFragment;
8787
import com.google.longrunning.Operation;
88-
import com.google.protobuf.Any;
89-
import com.google.protobuf.InvalidProtocolBufferException;
9088
import com.google.protobuf.Message;
9189
import com.google.protobuf.Timestamp;
9290
import com.google.protobuf.util.Durations;
9391
import com.google.protobuf.util.Timestamps;
94-
import com.google.rpc.PreconditionFailure;
95-
import com.google.rpc.PreconditionFailure.Violation;
9692
import io.grpc.Context;
9793
import io.grpc.Status.Code;
98-
import io.grpc.protobuf.StatusProto;
9994
import java.io.IOException;
10095
import java.time.Duration;
10196
import java.util.ArrayList;
@@ -114,38 +109,6 @@
114109
@ThreadSafe
115110
public class RemoteSpawnRunner implements SpawnRunner {
116111

117-
private static final String VIOLATION_TYPE_MISSING = "MISSING";
118-
119-
private static boolean retriableExecErrors(Exception e) {
120-
if (BulkTransferException.isOnlyCausedByCacheNotFoundException(e)) {
121-
return true;
122-
}
123-
if (!RemoteRetrierUtils.causedByStatus(e, Code.FAILED_PRECONDITION)) {
124-
return false;
125-
}
126-
com.google.rpc.Status status = StatusProto.fromThrowable(e);
127-
if (status == null || status.getDetailsCount() == 0) {
128-
return false;
129-
}
130-
for (Any details : status.getDetailsList()) {
131-
PreconditionFailure f;
132-
try {
133-
f = details.unpack(PreconditionFailure.class);
134-
} catch (InvalidProtocolBufferException protoEx) {
135-
return false;
136-
}
137-
if (f.getViolationsCount() == 0) {
138-
return false; // Generally shouldn't happen
139-
}
140-
for (Violation v : f.getViolationsList()) {
141-
if (!v.getType().equals(VIOLATION_TYPE_MISSING)) {
142-
return false;
143-
}
144-
}
145-
}
146-
return true; // if *all* > 0 violations have type MISSING
147-
}
148-
149112
private final Path execRoot;
150113
private final RemoteOptions remoteOptions;
151114
private final ExecutionOptions executionOptions;
@@ -656,12 +619,10 @@ private SpawnResult handleError(
656619
catastrophe = false;
657620
}
658621

659-
final String errorMessage;
660-
if (!verboseFailures) {
661-
errorMessage = Utils.grpcAwareErrorMessage(exception);
662-
} else {
622+
String errorMessage = Utils.grpcAwareErrorMessage(exception);
623+
if (verboseFailures) {
663624
// On --verbose_failures print the whole stack trace
664-
errorMessage = Throwables.getStackTraceAsString(exception);
625+
errorMessage += "\n" + Throwables.getStackTraceAsString(exception);
665626
}
666627

667628
return new SpawnResult.Builder()
@@ -817,12 +778,7 @@ static Collection<Path> resolveActionInputs(
817778

818779
private static RemoteRetrier createExecuteRetrier(
819780
RemoteOptions options, ListeningScheduledExecutorService retryService) {
820-
return new RemoteRetrier(
821-
options.remoteMaxRetryAttempts > 0
822-
? () -> new Retrier.ZeroBackoff(options.remoteMaxRetryAttempts)
823-
: () -> Retrier.RETRIES_DISABLED,
824-
RemoteSpawnRunner::retriableExecErrors,
825-
retryService,
826-
Retrier.ALLOW_ALL_CALLS);
781+
return new ExecuteRetrier(
782+
options.remoteMaxRetryAttempts, retryService, Retrier.ALLOW_ALL_CALLS);
827783
}
828784
}

src/main/java/com/google/devtools/build/lib/remote/Retrier.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,11 @@ public interface Backoff {
4747
* Returns the next delay in milliseconds, or a value less than {@code 0} if we should stop
4848
* retrying.
4949
*/
50-
long nextDelayMillis();
50+
long nextDelayMillis(Exception e);
5151

5252
/**
53-
* Returns the number of calls to {@link #nextDelayMillis()} thus far, not counting any calls
54-
* that returned less than {@code 0}.
53+
* Returns the number of calls to {@link #nextDelayMillis(Exception)} thus far, not counting any
54+
* calls that returned less than {@code 0}.
5555
*/
5656
int getRetryAttempts();
5757
}
@@ -140,7 +140,7 @@ public void recordSuccess() {}
140140
public static final Backoff RETRIES_DISABLED =
141141
new Backoff() {
142142
@Override
143-
public long nextDelayMillis() {
143+
public long nextDelayMillis(Exception e) {
144144
return -1;
145145
}
146146

@@ -161,7 +161,7 @@ public ZeroBackoff(int maxRetries) {
161161
}
162162

163163
@Override
164-
public long nextDelayMillis() {
164+
public long nextDelayMillis(Exception e) {
165165
if (retries >= maxRetries) {
166166
return -1;
167167
}
@@ -253,7 +253,7 @@ public <T> T execute(Callable<T> call, Backoff backoff) throws Exception {
253253
if (!shouldRetry.test(e)) {
254254
throw e;
255255
}
256-
final long delayMillis = backoff.nextDelayMillis();
256+
final long delayMillis = backoff.nextDelayMillis(e);
257257
if (delayMillis < 0) {
258258
throw e;
259259
}
@@ -286,7 +286,7 @@ public <T> ListenableFuture<T> executeAsync(AsyncCallable<T> call, Backoff backo
286286
private <T> ListenableFuture<T> onExecuteAsyncFailure(
287287
Exception t, AsyncCallable<T> call, Backoff backoff) {
288288
if (isRetriable(t)) {
289-
long waitMillis = backoff.nextDelayMillis();
289+
long waitMillis = backoff.nextDelayMillis(t);
290290
if (waitMillis >= 0) {
291291
try {
292292
return Futures.scheduleAsync(

src/main/java/com/google/devtools/build/lib/remote/util/BUILD

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,20 @@ java_library(
2020
"//src/main/java/com/google/devtools/build/lib/analysis:blaze_version_info",
2121
"//src/main/java/com/google/devtools/build/lib/authandtls",
2222
"//src/main/java/com/google/devtools/build/lib/concurrent",
23+
"//src/main/java/com/google/devtools/build/lib/remote:ExecutionStatusException",
2324
"//src/main/java/com/google/devtools/build/lib/remote/common",
2425
"//src/main/java/com/google/devtools/build/lib/remote/options",
2526
"//src/main/java/com/google/devtools/build/lib/vfs",
2627
"//src/main/java/com/google/devtools/build/lib/vfs:pathfragment",
2728
"//src/main/protobuf:failure_details_java_proto",
28-
"//third_party:flogger",
2929
"//third_party:guava",
3030
"//third_party:jsr305",
3131
"//third_party/grpc:grpc-jar",
3232
"//third_party/protobuf:protobuf_java",
33+
"//third_party/protobuf:protobuf_java_util",
34+
"@googleapis//:google_rpc_code_java_proto",
35+
"@googleapis//:google_rpc_error_details_java_proto",
36+
"@googleapis//:google_rpc_status_java_proto",
3337
"@remoteapis//:build_bazel_remote_execution_v2_remote_execution_java_grpc",
3438
"@remoteapis//:build_bazel_remote_execution_v2_remote_execution_java_proto",
3539
],

0 commit comments

Comments
 (0)