Skip to content

Commit f7658e3

Browse files
committed
runtime: should fail fast if dial error on shim
In linux platform, the shim server always listens on the socket before the containerd task manager dial it. It is unlikely that containerd task manager should handle reconnect because the shim can't restart. For this case, the containerd task manager should fail fast if there is ENOENT or ECONNREFUSED error. And if the socket file is deleted during cleanup the exited task, it maybe cause that containerd task manager takes long time to reload the dead shim. For that task.v2 manager, the race case is like: ``` TaskService.Delete TaskManager.Delete(runtime/v2/manager.go) shim.delete(runtime/v2/shim.go) shimv2api.Shutdown(runtime/v2/task/shim.pb.go) <- containerd has been killed or restarted somehow bundle.Delete ``` The shimv2api.Shutdown will cause that the shim deletes socket file (containerd-shim-runc-v2 does). But the bundle is still there. During reloading, the containerd will wait for the socket file appears again in 100 seconds. It is not reasonable. The Reconnect should prevent this case by fast fail. Closes: containerd#5648. Fixes: containerd#5597. Signed-off-by: Wei Fu <[email protected]>
1 parent c16be1a commit f7658e3

3 files changed

Lines changed: 186 additions & 6 deletions

File tree

integration/shim_dial_unix_test.go

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
//go:build !windows
2+
// +build !windows
3+
4+
/*
5+
Copyright The containerd Authors.
6+
7+
Licensed under the Apache License, Version 2.0 (the "License");
8+
you may not use this file except in compliance with the License.
9+
You may obtain a copy of the License at
10+
11+
http://www.apache.org/licenses/LICENSE-2.0
12+
13+
Unless required by applicable law or agreed to in writing, software
14+
distributed under the License is distributed on an "AS IS" BASIS,
15+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
See the License for the specific language governing permissions and
17+
limitations under the License.
18+
*/
19+
20+
package integration
21+
22+
import (
23+
"context"
24+
"io/ioutil"
25+
"net"
26+
"os"
27+
"path/filepath"
28+
"strings"
29+
"syscall"
30+
"testing"
31+
"time"
32+
33+
v1shimcli "github.com/containerd/containerd/runtime/v1/shim/client"
34+
v2shimcli "github.com/containerd/containerd/runtime/v2/shim"
35+
"github.com/containerd/ttrpc"
36+
"github.com/pkg/errors"
37+
)
38+
39+
const abstractSocketPrefix = "\x00"
40+
41+
// TestFailFastWhenConnectShim is to test that the containerd task manager
42+
// should not tolerate ENOENT during restarting. In linux, the containerd shim
43+
// always listens on socket before task manager dial. If there is ENOENT or
44+
// ECONNREFUSED error, the task manager should clean up because that socket file
45+
// is gone or shim doesn't listen on the socket anymore.
46+
func TestFailFastWhenConnectShim(t *testing.T) {
47+
t.Parallel()
48+
49+
t.Run("abstract-unix-socket-v1", testFailFastWhenConnectShim(true, v1shimcli.AnonDialer))
50+
t.Run("abstract-unix-socket-v2", testFailFastWhenConnectShim(true, v2shimcli.AnonDialer))
51+
t.Run("normal-unix-socket-v1", testFailFastWhenConnectShim(false, v1shimcli.AnonDialer))
52+
t.Run("normal-unix-socket-v2", testFailFastWhenConnectShim(false, v2shimcli.AnonDialer))
53+
}
54+
55+
type dialFunc func(address string, timeout time.Duration) (net.Conn, error)
56+
57+
func testFailFastWhenConnectShim(abstract bool, dialFn dialFunc) func(*testing.T) {
58+
return func(t *testing.T) {
59+
var (
60+
ctx = context.Background()
61+
addr, listener, cleanup = newTestListener(t, abstract)
62+
errCh = make(chan error, 1)
63+
64+
checkDialErr = func(addr string, errCh chan error, expected error) {
65+
go func() {
66+
_, err := dialFn(addr, 1*time.Hour)
67+
errCh <- err
68+
}()
69+
70+
select {
71+
case <-time.After(10 * time.Second):
72+
t.Fatalf("expected fail fast, but got timeout")
73+
case err := <-errCh:
74+
t.Helper()
75+
if !errors.Is(err, expected) {
76+
t.Fatalf("expected error %v, but got %v", expected, err)
77+
}
78+
}
79+
}
80+
)
81+
defer cleanup()
82+
defer listener.Close()
83+
84+
ttrpcSrv, err := ttrpc.NewServer()
85+
if err != nil {
86+
t.Fatalf("failed to new ttrpc server: %v", err)
87+
}
88+
go func() {
89+
ttrpcSrv.Serve(ctx, listener)
90+
}()
91+
92+
// ttrpcSrv starts in other goroutine so that we need to retry AnonDialer
93+
// here until ttrpcSrv receives the request.
94+
go func() {
95+
to := time.After(10 * time.Second)
96+
97+
for {
98+
select {
99+
case <-to:
100+
errCh <- errors.New("timeout")
101+
return
102+
default:
103+
}
104+
105+
conn, err := dialFn(addr, 1*time.Hour)
106+
if err != nil {
107+
if errors.Is(err, syscall.ECONNREFUSED) {
108+
time.Sleep(10 * time.Millisecond)
109+
continue
110+
}
111+
errCh <- err
112+
return
113+
}
114+
115+
conn.Close()
116+
errCh <- nil
117+
return
118+
}
119+
}()
120+
121+
// it should be successful
122+
if err := <-errCh; err != nil {
123+
t.Fatalf("failed to dial: %v", err)
124+
}
125+
126+
// NOTE(fuweid):
127+
//
128+
// UnixListener will unlink that the socket file when call Close.
129+
// Disable unlink when close to keep the socket file.
130+
listener.(*net.UnixListener).SetUnlinkOnClose(false)
131+
132+
listener.Close()
133+
ttrpcSrv.Shutdown(ctx)
134+
135+
checkDialErr(addr, errCh, syscall.ECONNREFUSED)
136+
137+
// remove the socket file
138+
cleanup()
139+
140+
if abstract {
141+
checkDialErr(addr, errCh, syscall.ECONNREFUSED)
142+
} else {
143+
// should not wait for the socket file show up again.
144+
checkDialErr(addr, errCh, syscall.ENOENT)
145+
}
146+
}
147+
}
148+
149+
func newTestListener(t testing.TB, abstract bool) (string, net.Listener, func()) {
150+
tmpDir, err := ioutil.TempDir("", "shim-ut-XX")
151+
if err != nil {
152+
t.Fatalf("failed to create tmp directory: %v", err)
153+
}
154+
155+
// NOTE(fuweid):
156+
//
157+
// Before patch https://github.com/containerd/containerd/commit/bd908acabd1a31c8329570b5283e8fdca0b39906,
158+
// The shim stores the abstract socket file without abstract socket
159+
// prefix and `unix://`. For the existing shim, if the socket file
160+
// only contains the path, it will indicate that it is abstract socket.
161+
// Otherwise, it will be normal socket file formated in `unix:///xyz'.
162+
addr := filepath.Join(tmpDir, "uds.socket")
163+
if abstract {
164+
addr = abstractSocketPrefix + addr
165+
} else {
166+
addr = "unix://" + addr
167+
}
168+
169+
listener, err := net.Listen("unix", strings.TrimPrefix(addr, "unix://"))
170+
if err != nil {
171+
t.Fatalf("failed to listen on %s: %v", addr, err)
172+
}
173+
174+
return strings.TrimPrefix(addr, abstractSocketPrefix), listener, func() {
175+
os.RemoveAll(tmpDir)
176+
}
177+
}

runtime/v1/shim/client/client.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ import (
3535

3636
"github.com/containerd/containerd/events"
3737
"github.com/containerd/containerd/log"
38-
"github.com/containerd/containerd/pkg/dialer"
3938
v1 "github.com/containerd/containerd/runtime/v1"
4039
"github.com/containerd/containerd/runtime/v1/shim"
4140
shimapi "github.com/containerd/containerd/runtime/v1/shim/v1"
@@ -299,12 +298,19 @@ func RemoveSocket(address string) error {
299298
return nil
300299
}
301300

301+
// AnonDialer returns a dialer for a socket
302+
//
303+
// NOTE: It is only used for testing.
304+
func AnonDialer(address string, timeout time.Duration) (net.Conn, error) {
305+
return anonDialer(address, timeout)
306+
}
307+
302308
func connect(address string, d func(string, time.Duration) (net.Conn, error)) (net.Conn, error) {
303309
return d(address, 100*time.Second)
304310
}
305311

306312
func anonDialer(address string, timeout time.Duration) (net.Conn, error) {
307-
return dialer.Dialer(socket(address).path(), timeout)
313+
return net.DialTimeout("unix", socket(address).path(), timeout)
308314
}
309315

310316
// WithConnect connects to an existing shim

runtime/v2/shim/util_unix.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ import (
3232

3333
"github.com/containerd/containerd/defaults"
3434
"github.com/containerd/containerd/namespaces"
35-
"github.com/containerd/containerd/pkg/dialer"
3635
"github.com/containerd/containerd/sys"
3736
"github.com/pkg/errors"
3837
)
@@ -78,9 +77,7 @@ func SocketAddress(ctx context.Context, socketPath, id string) (string, error) {
7877

7978
// AnonDialer returns a dialer for a socket
8079
func AnonDialer(address string, timeout time.Duration) (net.Conn, error) {
81-
ctx, cancel := context.WithTimeout(context.Background(), timeout)
82-
defer cancel()
83-
return dialer.ContextDialer(ctx, socket(address).path())
80+
return net.DialTimeout("unix", socket(address).path(), timeout)
8481
}
8582

8683
// AnonReconnectDialer returns a dialer for an existing socket on reconnection

0 commit comments

Comments
 (0)