Skip to content

Commit 0bc9633

Browse files
committed
runtime/v2: net.Dial gRPC shim sockets before trying grpc
This is mostly to work around an issue with gRPC-based shims after containerd restart. If a shim dies while containerd is also down/restarting, on reboot grpc.DialContext with our current set of DialOptions will make us wait for 100 seconds per shim even if the socket no longer exists or has no listener. Signed-off-by: Danny Canter <[email protected]>
1 parent 8459273 commit 0bc9633

1 file changed

Lines changed: 23 additions & 3 deletions

File tree

runtime/v2/shim.go

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,14 @@ import (
2222
"errors"
2323
"fmt"
2424
"io"
25+
"net"
2526
"os"
2627
"path/filepath"
2728
"strings"
2829
"time"
2930

3031
"github.com/containerd/containerd/v2/pkg/atomicfile"
32+
"github.com/containerd/containerd/v2/pkg/dialer"
3133
"github.com/containerd/ttrpc"
3234
"google.golang.org/grpc"
3335
"google.golang.org/grpc/connectivity"
@@ -39,7 +41,6 @@ import (
3941
"github.com/containerd/containerd/v2/errdefs"
4042
"github.com/containerd/containerd/v2/events/exchange"
4143
"github.com/containerd/containerd/v2/identifiers"
42-
"github.com/containerd/containerd/v2/pkg/dialer"
4344
"github.com/containerd/containerd/v2/pkg/timeout"
4445
"github.com/containerd/containerd/v2/protobuf"
4546
ptypes "github.com/containerd/containerd/v2/protobuf/types"
@@ -275,7 +276,7 @@ func makeConnection(ctx context.Context, id string, params client.BootstrapParam
275276
grpc.WithTransportCredentials(insecure.NewCredentials()),
276277
grpc.WithBlock(),
277278
}
278-
return grpcDialContext(ctx, dialer.DialAddress(params.Address), onClose, gopts...)
279+
return grpcDialContext(ctx, params.Address, onClose, gopts...)
279280
default:
280281
return nil, fmt.Errorf("unexpected protocol: %q", params.Protocol)
281282
}
@@ -286,10 +287,29 @@ func makeConnection(ctx context.Context, id string, params client.BootstrapParam
286287
// a callback run when the connection is severed or explicitly closed.
287288
func grpcDialContext(
288289
ctx context.Context,
289-
target string,
290+
address string,
290291
onClose func(),
291292
gopts ...grpc.DialOption,
292293
) (*grpcConn, error) {
294+
// If grpc.WithBlock is specified in gopts this causes the connection to block waiting for
295+
// a connection regardless of if the socket exists or has a listener when Dial begins. This
296+
// specific behavior of WithBlock is mostly undesirable for shims, as if the socket isn't
297+
// there when we go to load/connect there's likely an issue. However, getting rid of WithBlock is
298+
// also undesirable as we don't want the background connection behavior, we want to ensure
299+
// a connection before moving on. To bring this in line with the ttrpc connection behavior
300+
// let's do an initial dial to ensure the shim's socket is actually available. stat wouldn't suffice
301+
// here as if the shim exited unexpectedly its socket may still be on the filesystem, but it'd return
302+
// ECONNREFUSED which grpc.DialContext will happily trudge along through for the full timeout.
303+
//
304+
// This is especially helpful on restart of containerd as if the shim died while containerd
305+
// was down, we end up waiting the full timeout.
306+
conn, err := net.DialTimeout("unix", address, time.Second*10)
307+
if err != nil {
308+
return nil, err
309+
}
310+
conn.Close()
311+
312+
target := dialer.DialAddress(address)
293313
client, err := grpc.DialContext(ctx, target, gopts...)
294314
if err != nil {
295315
return nil, fmt.Errorf("failed to create GRPC connection: %w", err)

0 commit comments

Comments
 (0)