Skip to content

Commit 00bf437

Browse files
committed
Add nlutil functions to retry on netlink EINTR
A recent change to the vishvananda/netlink package exposes NLM_F_DUMP_INTR in some netlink responses as an EINTR (with no data). Retry the requests when that happens, up to five times, before returning the error. The limit of five is arbitrary, on most systems a single retry will be rare but, there's no guarantee that a retry will succeed. So, on a very busy or misbehaving system the error may still be returned. In most cases, this will lead to failure of the operation being attempted (which may lead to daemon startup failure, network initialisation failure etc). Signed-off-by: Rob Murray <[email protected]>
1 parent 7156bfa commit 00bf437

25 files changed

Lines changed: 263 additions & 75 deletions

daemon/cluster/listen_addr_linux.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@ package cluster // import "github.com/docker/docker/daemon/cluster"
33
import (
44
"net"
55

6+
"github.com/docker/docker/internal/nlwrap"
67
"github.com/vishvananda/netlink"
78
)
89

910
func (c *Cluster) resolveSystemAddr() (net.IP, error) {
1011
// Use the system's only device IP address, or fail if there are
1112
// multiple addresses to choose from.
12-
interfaces, err := netlink.LinkList()
13+
interfaces, err := nlwrap.LinkList()
1314
if err != nil {
1415
return nil, err
1516
}
@@ -26,7 +27,7 @@ func (c *Cluster) resolveSystemAddr() (net.IP, error) {
2627
continue
2728
}
2829

29-
addrs, err := netlink.AddrList(intf, netlink.FAMILY_ALL)
30+
addrs, err := nlwrap.AddrList(intf, netlink.FAMILY_ALL)
3031
if err != nil {
3132
continue
3233
}

daemon/daemon_unix.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"github.com/docker/docker/daemon/config"
2828
"github.com/docker/docker/daemon/initlayer"
2929
"github.com/docker/docker/errdefs"
30+
"github.com/docker/docker/internal/nlwrap"
3031
"github.com/docker/docker/libcontainerd/remote"
3132
"github.com/docker/docker/libnetwork"
3233
nwconfig "github.com/docker/docker/libnetwork/config"
@@ -1065,7 +1066,7 @@ func initBridgeDriver(controller *libnetwork.Controller, cfg config.BridgeConfig
10651066

10661067
// Remove default bridge interface if present (--bridge=none use case)
10671068
func removeDefaultBridgeInterface() {
1068-
if lnk, err := netlink.LinkByName(bridge.DefaultBridgeName); err == nil {
1069+
if lnk, err := nlwrap.LinkByName(bridge.DefaultBridgeName); err == nil {
10691070
if err := netlink.LinkDel(lnk); err != nil {
10701071
log.G(context.TODO()).Warnf("Failed to remove bridge interface (%s): %v", bridge.DefaultBridgeName, err)
10711072
}

integration-cli/docker_cli_network_unix_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/docker/docker/api/types/network"
1919
"github.com/docker/docker/integration-cli/cli"
2020
"github.com/docker/docker/integration-cli/daemon"
21+
"github.com/docker/docker/internal/nlwrap"
2122
"github.com/docker/docker/libnetwork/driverapi"
2223
remoteapi "github.com/docker/docker/libnetwork/drivers/remote/api"
2324
"github.com/docker/docker/libnetwork/ipamapi"
@@ -120,7 +121,7 @@ func setupRemoteNetworkDrivers(t *testing.T, mux *http.ServeMux, url, netDrv, ip
120121

121122
mux.HandleFunc(fmt.Sprintf("/%s.DeleteEndpoint", driverapi.NetworkPluginEndpointType), func(w http.ResponseWriter, r *http.Request) {
122123
w.Header().Set("Content-Type", plugins.VersionMimetype)
123-
if link, err := netlink.LinkByName("cnt0"); err == nil {
124+
if link, err := nlwrap.LinkByName("cnt0"); err == nil {
124125
err = netlink.LinkDel(link)
125126
assert.NilError(t, err)
126127
}
@@ -1778,7 +1779,7 @@ func (s *DockerNetworkSuite) TestConntrackFlowsLeak(c *testing.T) {
17781779
cli.DockerCmd(c, "run", "-d", "--name", "client", "--net=host", "busybox", "sh", "-c", cmd)
17791780

17801781
// Get all the flows using netlink
1781-
flows, err := netlink.ConntrackTableList(netlink.ConntrackTable, unix.AF_INET)
1782+
flows, err := nlwrap.ConntrackTableList(netlink.ConntrackTable, unix.AF_INET)
17821783
assert.NilError(c, err)
17831784
var flowMatch int
17841785
for _, flow := range flows {
@@ -1796,7 +1797,7 @@ func (s *DockerNetworkSuite) TestConntrackFlowsLeak(c *testing.T) {
17961797
cli.DockerCmd(c, "rm", "-fv", "server")
17971798

17981799
// Fetch again all the flows and validate that there is no server flow in the conntrack laying around
1799-
flows, err = netlink.ConntrackTableList(netlink.ConntrackTable, unix.AF_INET)
1800+
flows, err = nlwrap.ConntrackTableList(netlink.ConntrackTable, unix.AF_INET)
18001801
assert.NilError(c, err)
18011802
flowMatch = 0
18021803
for _, flow := range flows {

integration-cli/docker_cli_swarm_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"github.com/docker/docker/integration-cli/checker"
2323
"github.com/docker/docker/integration-cli/cli"
2424
"github.com/docker/docker/integration-cli/daemon"
25+
"github.com/docker/docker/internal/nlwrap"
2526
"github.com/docker/docker/libnetwork/driverapi"
2627
"github.com/docker/docker/libnetwork/ipamapi"
2728
remoteipam "github.com/docker/docker/libnetwork/ipams/remote/api"
@@ -718,7 +719,7 @@ func setupRemoteGlobalNetworkPlugin(t *testing.T, mux *http.ServeMux, url, netDr
718719

719720
mux.HandleFunc(fmt.Sprintf("/%s.DeleteEndpoint", driverapi.NetworkPluginEndpointType), func(w http.ResponseWriter, r *http.Request) {
720721
w.Header().Set("Content-Type", plugins.VersionMimetype)
721-
if link, err := netlink.LinkByName("cnt0"); err == nil {
722+
if link, err := nlwrap.LinkByName("cnt0"); err == nil {
722723
err := netlink.LinkDel(link)
723724
assert.NilError(t, err)
724725
}

internal/nlwrap/nlwrap_linux.go

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
// Package nlwrap wraps vishvandanda/netlink functions that may return EINTR.
2+
//
3+
// A Handle instantiated using [NewHandle] or [NewHandleAt] can be used in place
4+
// of a netlink.Handle, it's a wrapper that replaces methods that need to be
5+
// wrapped. Functions that use the package handle need to be called as "nlwrap.X"
6+
// instead of "netlink.X".
7+
//
8+
// The wrapped functions currently return EINTR when NLM_F_DUMP_INTR flagged
9+
// in a netlink response, meaning something changed during the dump so results
10+
// may be incomplete or inconsistent.
11+
//
12+
// At present, the possibly incomplete/inconsistent results are not returned
13+
// by netlink functions along with the EINTR. So, it's not possible to do
14+
// anything but retry. After maxAttempts the EINTR will be returned to the
15+
// caller.
16+
package nlwrap
17+
18+
import (
19+
"context"
20+
"errors"
21+
22+
"github.com/containerd/log"
23+
"github.com/vishvananda/netlink"
24+
"github.com/vishvananda/netns"
25+
"golang.org/x/sys/unix"
26+
)
27+
28+
// Arbitrary limit on max attempts at netlink calls if they are repeatedly interrupted.
29+
const maxAttempts = 5
30+
31+
type Handle struct {
32+
*netlink.Handle
33+
}
34+
35+
func NewHandle(nlFamilies ...int) (Handle, error) {
36+
nlh, err := netlink.NewHandle(nlFamilies...)
37+
if err != nil {
38+
return Handle{}, err
39+
}
40+
return Handle{nlh}, nil
41+
}
42+
43+
func NewHandleAt(ns netns.NsHandle, nlFamilies ...int) (Handle, error) {
44+
nlh, err := netlink.NewHandleAt(ns, nlFamilies...)
45+
if err != nil {
46+
return Handle{}, err
47+
}
48+
return Handle{nlh}, nil
49+
}
50+
51+
func (h Handle) Close() {
52+
if h.Handle != nil {
53+
h.Handle.Close()
54+
}
55+
}
56+
57+
func retryOnIntr(f func() error) {
58+
for attempt := 0; attempt < maxAttempts; attempt += 1 {
59+
if err := f(); !errors.Is(err, unix.EINTR) {
60+
return
61+
}
62+
}
63+
log.G(context.TODO()).Infof("netlink call interrupted after %d attempts", maxAttempts)
64+
}
65+
66+
// AddrList calls nlh.LinkList, retrying if necessary.
67+
func (nlh Handle) AddrList(link netlink.Link, family int) (addrs []netlink.Addr, err error) {
68+
retryOnIntr(func() error {
69+
addrs, err = nlh.Handle.AddrList(link, family)
70+
return err
71+
})
72+
return addrs, err
73+
}
74+
75+
// AddrList calls netlink.LinkList, retrying if necessary.
76+
func AddrList(link netlink.Link, family int) (addrs []netlink.Addr, err error) {
77+
retryOnIntr(func() error {
78+
addrs, err = netlink.AddrList(link, family)
79+
return err
80+
})
81+
return addrs, err
82+
}
83+
84+
// ConntrackDeleteFilters calls nlh.ConntrackDeleteFilters, retrying if necessary.
85+
func (nlh Handle) ConntrackDeleteFilters(
86+
table netlink.ConntrackTableType,
87+
family netlink.InetFamily,
88+
filters ...netlink.CustomConntrackFilter,
89+
) (matched uint, err error) {
90+
retryOnIntr(func() error {
91+
matched, err = nlh.Handle.ConntrackDeleteFilters(table, family, filters...)
92+
return err
93+
})
94+
return matched, err
95+
}
96+
97+
// ConntrackTableList calls netlink.ConntrackTableList, retrying if necessary.
98+
func ConntrackTableList(
99+
table netlink.ConntrackTableType,
100+
family netlink.InetFamily,
101+
) (flows []*netlink.ConntrackFlow, err error) {
102+
retryOnIntr(func() error {
103+
flows, err = netlink.ConntrackTableList(table, family)
104+
return err
105+
})
106+
return flows, err
107+
}
108+
109+
// LinkByName calls nlh.LinkByName, retrying if necessary. The netlink function
110+
// doesn't normally ask the kernel for a dump of links. But, on an old kernel, it
111+
// will do as a fallback and that dump may get inconsistent results.
112+
func (nlh Handle) LinkByName(name string) (link netlink.Link, err error) {
113+
retryOnIntr(func() error {
114+
link, err = nlh.Handle.LinkByName(name)
115+
return err
116+
})
117+
return link, err
118+
}
119+
120+
// LinkByName calls netlink.LinkByName, retrying if necessary. The netlink
121+
// function doesn't normally ask the kernel for a dump of links. But, on an old
122+
// kernel, it will do as a fallback and that dump may get inconsistent results.
123+
func LinkByName(name string) (link netlink.Link, err error) {
124+
retryOnIntr(func() error {
125+
link, err = netlink.LinkByName(name)
126+
return err
127+
})
128+
return link, err
129+
}
130+
131+
// LinkList calls nlh.LinkList, retrying if necessary.
132+
func (nlh Handle) LinkList() (links []netlink.Link, err error) {
133+
retryOnIntr(func() error {
134+
links, err = nlh.Handle.LinkList()
135+
return err
136+
})
137+
return links, err
138+
}
139+
140+
// LinkList calls netlink.LinkList, retrying if necessary.
141+
func LinkList() (links []netlink.Link, err error) {
142+
retryOnIntr(func() error {
143+
links, err = netlink.LinkList()
144+
return err
145+
})
146+
return links, err
147+
}
148+
149+
// RouteList calls nlh.RouteList, retrying if necessary.
150+
func (nlh Handle) RouteList(link netlink.Link, family int) (routes []netlink.Route, err error) {
151+
retryOnIntr(func() error {
152+
routes, err = nlh.Handle.RouteList(link, family)
153+
return err
154+
})
155+
return routes, err
156+
}
157+
158+
func (nlh Handle) XfrmPolicyList(family int) (policies []netlink.XfrmPolicy, err error) {
159+
retryOnIntr(func() error {
160+
policies, err = nlh.Handle.XfrmPolicyList(family)
161+
return err
162+
})
163+
return policies, err
164+
}
165+
166+
func (nlh Handle) XfrmStateList(family int) (states []netlink.XfrmState, err error) {
167+
retryOnIntr(func() error {
168+
states, err = nlh.Handle.XfrmStateList(family)
169+
return err
170+
})
171+
return states, err
172+
}

libnetwork/drivers/bridge/bridge_linux.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212

1313
"github.com/containerd/log"
1414
"github.com/docker/docker/errdefs"
15+
"github.com/docker/docker/internal/nlwrap"
1516
"github.com/docker/docker/libnetwork/datastore"
1617
"github.com/docker/docker/libnetwork/driverapi"
1718
"github.com/docker/docker/libnetwork/drivers/bridge/internal/rlkclient"
@@ -155,7 +156,7 @@ type driver struct {
155156
isolationChain2V6 *iptables.ChainInfo
156157
networks map[string]*bridgeNetwork
157158
store *datastore.Store
158-
nlh *netlink.Handle
159+
nlh nlwrap.Handle
159160
portDriverClient portDriverClient
160161
configNetwork sync.Mutex
161162
sync.Mutex
@@ -808,7 +809,7 @@ func (d *driver) checkConflict(config *networkConfiguration) error {
808809
func (d *driver) createNetwork(config *networkConfiguration) (err error) {
809810
// Initialize handle when needed
810811
d.Lock()
811-
if d.nlh == nil {
812+
if d.nlh.Handle == nil {
812813
d.nlh = ns.NlHandle()
813814
}
814815
d.Unlock()
@@ -1018,7 +1019,7 @@ func (d *driver) deleteNetwork(nid string) error {
10181019
return d.storeDelete(config)
10191020
}
10201021

1021-
func addToBridge(ctx context.Context, nlh *netlink.Handle, ifaceName, bridgeName string) error {
1022+
func addToBridge(ctx context.Context, nlh nlwrap.Handle, ifaceName, bridgeName string) error {
10221023
ctx, span := otel.Tracer("").Start(ctx, "libnetwork.drivers.bridge.addToBridge", trace.WithAttributes(
10231024
attribute.String("ifaceName", ifaceName),
10241025
attribute.String("bridgeName", bridgeName)))
@@ -1035,7 +1036,7 @@ func addToBridge(ctx context.Context, nlh *netlink.Handle, ifaceName, bridgeName
10351036
return nil
10361037
}
10371038

1038-
func setHairpinMode(nlh *netlink.Handle, link netlink.Link, enable bool) error {
1039+
func setHairpinMode(nlh nlwrap.Handle, link netlink.Link, enable bool) error {
10391040
err := nlh.LinkSetHairpin(link, enable)
10401041
if err != nil {
10411042
return fmt.Errorf("unable to set hairpin mode on %s via netlink: %v",

libnetwork/drivers/bridge/bridge_linux_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"strconv"
1212
"testing"
1313

14+
"github.com/docker/docker/internal/nlwrap"
1415
"github.com/docker/docker/internal/testutils/netnsutils"
1516
"github.com/docker/docker/libnetwork/driverapi"
1617
"github.com/docker/docker/libnetwork/internal/netiputil"
@@ -1226,7 +1227,7 @@ func TestCreateWithExistingBridge(t *testing.T) {
12261227
t.Fatalf("Failed to delete network %s: %v", brName, err)
12271228
}
12281229

1229-
if _, err := netlink.LinkByName(brName); err != nil {
1230+
if _, err := nlwrap.LinkByName(brName); err != nil {
12301231
t.Fatal("Deleting bridge network that using existing bridge interface unexpectedly deleted the bridge interface")
12311232
}
12321233
}

libnetwork/drivers/bridge/interface_linux.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88

99
"github.com/containerd/log"
1010
"github.com/docker/docker/errdefs"
11+
"github.com/docker/docker/internal/nlwrap"
1112
"github.com/docker/docker/libnetwork/internal/netiputil"
1213
"github.com/vishvananda/netlink"
1314
)
@@ -25,14 +26,14 @@ type bridgeInterface struct {
2526
bridgeIPv6 *net.IPNet
2627
gatewayIPv4 net.IP
2728
gatewayIPv6 net.IP
28-
nlh *netlink.Handle
29+
nlh nlwrap.Handle
2930
}
3031

3132
// newInterface creates a new bridge interface structure. It attempts to find
3233
// an already existing device identified by the configuration BridgeName field,
3334
// or the default bridge name when unspecified, but doesn't attempt to create
3435
// one when missing
35-
func newInterface(nlh *netlink.Handle, config *networkConfiguration) (*bridgeInterface, error) {
36+
func newInterface(nlh nlwrap.Handle, config *networkConfiguration) (*bridgeInterface, error) {
3637
var err error
3738
i := &bridgeInterface{nlh: nlh}
3839

libnetwork/drivers/bridge/interface_linux_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"sort"
66
"testing"
77

8+
"github.com/docker/docker/internal/nlwrap"
89
"github.com/docker/docker/internal/testutils/netnsutils"
910
"github.com/vishvananda/netlink"
1011
"golang.org/x/sys/unix"
@@ -28,7 +29,7 @@ func addAddr(t *testing.T, link netlink.Link, addr string) {
2829

2930
func prepTestBridge(t *testing.T, nc *networkConfiguration) *bridgeInterface {
3031
t.Helper()
31-
nh, err := netlink.NewHandle()
32+
nh, err := nlwrap.NewHandle()
3233
assert.Assert(t, err)
3334
i, err := newInterface(nh, nc)
3435
assert.Assert(t, err)
@@ -40,7 +41,7 @@ func prepTestBridge(t *testing.T, nc *networkConfiguration) *bridgeInterface {
4041
func TestInterfaceDefaultName(t *testing.T) {
4142
defer netnsutils.SetupTestOSContext(t)()
4243

43-
nh, err := netlink.NewHandle()
44+
nh, err := nlwrap.NewHandle()
4445
if err != nil {
4546
t.Fatal(err)
4647
}
@@ -60,7 +61,7 @@ func TestAddressesNoInterface(t *testing.T) {
6061
func TestAddressesEmptyInterface(t *testing.T) {
6162
defer netnsutils.SetupTestOSContext(t)()
6263

63-
nh, err := netlink.NewHandle()
64+
nh, err := nlwrap.NewHandle()
6465
assert.NilError(t, err)
6566

6667
inf, err := newInterface(nh, &networkConfiguration{})

0 commit comments

Comments
 (0)