@@ -22,12 +22,14 @@ import (
2222 "errors"
2323 "fmt"
2424 "io"
25+ "net"
2526 "os"
2627 "path/filepath"
2728 "strings"
2829 "time"
2930
3031 "github.com/containerd/containerd/v2/pkg/atomicfile"
32+ "github.com/containerd/containerd/v2/pkg/dialer"
3133 "github.com/containerd/ttrpc"
3234 "google.golang.org/grpc"
3335 "google.golang.org/grpc/connectivity"
@@ -39,7 +41,6 @@ import (
3941 "github.com/containerd/containerd/v2/errdefs"
4042 "github.com/containerd/containerd/v2/events/exchange"
4143 "github.com/containerd/containerd/v2/identifiers"
42- "github.com/containerd/containerd/v2/pkg/dialer"
4344 "github.com/containerd/containerd/v2/pkg/timeout"
4445 "github.com/containerd/containerd/v2/protobuf"
4546 ptypes "github.com/containerd/containerd/v2/protobuf/types"
@@ -275,7 +276,7 @@ func makeConnection(ctx context.Context, id string, params client.BootstrapParam
275276 grpc .WithTransportCredentials (insecure .NewCredentials ()),
276277 grpc .WithBlock (),
277278 }
278- return grpcDialContext (ctx , dialer . DialAddress ( params .Address ) , onClose , gopts ... )
279+ return grpcDialContext (ctx , params .Address , onClose , gopts ... )
279280 default :
280281 return nil , fmt .Errorf ("unexpected protocol: %q" , params .Protocol )
281282 }
@@ -286,10 +287,29 @@ func makeConnection(ctx context.Context, id string, params client.BootstrapParam
286287// a callback run when the connection is severed or explicitly closed.
287288func grpcDialContext (
288289 ctx context.Context ,
289- target string ,
290+ address string ,
290291 onClose func (),
291292 gopts ... grpc.DialOption ,
292293) (* grpcConn , error ) {
294+ // If grpc.WithBlock is specified in gopts, the dial blocks waiting for
295+ // a connection regardless of whether the socket exists or has a listener when Dial begins. This
296+ // specific behavior of WithBlock is mostly undesirable for shims, as if the socket isn't
297+ // there when we go to load/connect there's likely an issue. However, getting rid of WithBlock is
298+ // also undesirable as we don't want the background connection behavior, we want to ensure
299+ // a connection before moving on. To bring this in line with the ttrpc connection behavior
300+ // let's do an initial dial to ensure the shim's socket is actually available. stat wouldn't suffice
301+ // here: if the shim exited unexpectedly, its socket may still be on the filesystem, but dialing it would return
302+ // ECONNREFUSED, which grpc.DialContext will happily trudge along through for the full timeout.
303+ //
304+ // This is especially helpful on restart of containerd: if the shim died while containerd
305+ // was down, we would otherwise end up waiting the full timeout.
306+ conn , err := net .DialTimeout ("unix" , address , time .Second * 10 )
307+ if err != nil {
308+ return nil , err
309+ }
310+ conn .Close ()
311+
312+ target := dialer .DialAddress (address )
293313 client , err := grpc .DialContext (ctx , target , gopts ... )
294314 if err != nil {
295315 return nil , fmt .Errorf ("failed to create GRPC connection: %w" , err )
0 commit comments