Skip to content

Commit 1c2b744

Browse files
committed
libnetwork/d/overlay: properly model peer db
The overlay driver assumes that the peer table in NetworkDB will always converge to a 1:1:1 mapping from peer endpoint IP address to MAC address to VTEP. While this currently holds true in practice most of the time, it is not an invariant and there are ways that users can violate this assumption. The driver detects whether peer entries conflict with each other by matching up (IP, MAC) tuples. In the common case this works out fine as the MAC address for an endpoint is generally derived from the assigned IP address. If an IP address gets reassigned to a container on another node the MAC address will follow, so the driver's conflict resolution logic will behave as intended. However users may explicitly configure the MAC address for a container's network endpoints. If an IP address gets reassigned from a container with an auto-generated MAC address to a container with a manually-configured MAC, or vice versa, the driver would not detect the conflict as the (IP, MAC) tuples won't match up. It would attempt to program the kernel's neighbor table with two conflicting MAC addresses for one IP, which will fail. And since it does not realize that there is a conflict, the driver won't reprogram the kernel from the remaining entry when the other entry is deleted. The assumption that only one IP address may resolve to a given MAC address is violated if multiple IP addresses are assigned to an endpoint. This rarely comes up in practice today as the overlay driver only supports IPv4 single-stack connectivity for endpoints. If multiple distinct peer entries exist with the same MAC address, the driver will delete the MAC->VTEP mapping from the kernel's forwarding database when any entry is deleted, even if other entries remain active. This limitation is one of the biggest obstacles in the way of supporting IPv6 and dual-stack connectivity for endpoints attached to overlay networks. Modify the peer db logic to correctly handle the cases where peer entries have non-unique MAC or VTEP values. Treat any set of entries with non-unique IP addresses as a conflict, irrespective of the entries' MAC addresses. Maintain a reference count of forwarding database entries and only delete the MAC->VTEP mapping from the kernel when there are no longer any neighbor entries which resolve to that MAC. Signed-off-by: Cory Snider <[email protected]>
1 parent 59437f5 commit 1c2b744

4 files changed

Lines changed: 106 additions & 69 deletions

File tree

libnetwork/drivers/overlay/ov_network.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
//go:build linux
1+
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
2+
//go:build go1.23 && linux
23

34
package overlay
45

@@ -18,6 +19,7 @@ import (
1819
"github.com/docker/docker/internal/nlwrap"
1920
"github.com/docker/docker/libnetwork/driverapi"
2021
"github.com/docker/docker/libnetwork/drivers/overlay/overlayutils"
22+
"github.com/docker/docker/libnetwork/internal/countmap"
2123
"github.com/docker/docker/libnetwork/internal/netiputil"
2224
"github.com/docker/docker/libnetwork/netlabel"
2325
"github.com/docker/docker/libnetwork/ns"
@@ -53,6 +55,8 @@ type network struct {
5355
endpoints endpointTable
5456
driver *driver
5557
joinCnt int
58+
// Ref count of VXLAN Forwarding Database entries programmed into the kernel
59+
fdbCnt countmap.Map[ipmac]
5660
sboxInit bool
5761
initEpoch int
5862
initErr error
@@ -99,6 +103,7 @@ func (d *driver) CreateNetwork(ctx context.Context, id string, option map[string
99103
driver: d,
100104
endpoints: endpointTable{},
101105
subnets: []*subnet{},
106+
fdbCnt: countmap.Map[ipmac]{},
102107
}
103108

104109
vnis := make([]uint32, 0, len(ipV4Data))
@@ -586,6 +591,7 @@ func (n *network) initSandbox() error {
586591

587592
// this is needed to let the peerAdd configure the sandbox
588593
n.sbox = sbox
594+
n.fdbCnt = countmap.Map[ipmac]{}
589595

590596
return nil
591597
}

libnetwork/drivers/overlay/peerdb.go

Lines changed: 53 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,17 @@ import (
2020
const ovPeerTable = "overlay_peer_table"
2121

2222
type peerEntry struct {
23-
eid string
24-
vtep netip.Addr // Virtual Tunnel End Point for non-local peers
25-
prefixBits int // number of 1-bits in network mask of peerIP
23+
eid string
24+
mac macAddr
25+
vtep netip.Addr
2626
}
2727

2828
func (p *peerEntry) isLocal() bool {
2929
return !p.vtep.IsValid()
3030
}
3131

3232
type peerMap struct {
33-
// set of peerEntry, note the values have to be objects and not pointers to maintain the proper equality checks
34-
mp setmatrix.SetMatrix[ipmac, peerEntry]
33+
mp setmatrix.SetMatrix[netip.Prefix, peerEntry]
3534
sync.Mutex
3635
}
3736

@@ -41,16 +40,16 @@ type peerNetworkMap struct {
4140
sync.Mutex
4241
}
4342

44-
func (d *driver) peerDbNetworkWalk(nid string, f func(netip.Addr, net.HardwareAddr, *peerEntry) bool) error {
43+
func (d *driver) peerDbNetworkWalk(nid string, f func(netip.Prefix, peerEntry) bool) {
4544
d.peerDb.Lock()
4645
pMap, ok := d.peerDb.mp[nid]
4746
d.peerDb.Unlock()
4847

4948
if !ok {
50-
return nil
49+
return
5150
}
5251

53-
mp := map[ipmac]peerEntry{}
52+
mp := map[netip.Prefix]peerEntry{}
5453
pMap.Lock()
5554
for _, pKey := range pMap.mp.Keys() {
5655
entryDBList, ok := pMap.mp.Get(pKey)
@@ -60,38 +59,28 @@ func (d *driver) peerDbNetworkWalk(nid string, f func(netip.Addr, net.HardwareAd
6059
}
6160
pMap.Unlock()
6261

63-
for pKey, pEntry := range mp {
64-
if f(pKey.ip, pKey.mac.HardwareAddr(), &pEntry) {
65-
return nil
62+
for k, v := range mp {
63+
if f(k, v) {
64+
return
6665
}
6766
}
68-
69-
return nil
7067
}
7168

72-
func (d *driver) peerDbSearch(nid string, peerIP netip.Addr) (netip.Addr, net.HardwareAddr, *peerEntry, error) {
73-
var peerIPMatched netip.Addr
74-
var peerMacMatched net.HardwareAddr
75-
var pEntryMatched *peerEntry
76-
err := d.peerDbNetworkWalk(nid, func(ip netip.Addr, mac net.HardwareAddr, pEntry *peerEntry) bool {
77-
if ip == peerIP {
78-
peerIPMatched = ip
79-
peerMacMatched = mac
80-
pEntryMatched = pEntry
81-
return true
82-
}
83-
84-
return false
85-
})
86-
if err != nil {
87-
return netip.Addr{}, nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err)
69+
func (d *driver) peerDbGet(nid string, peerIP netip.Prefix) (peerEntry, bool) {
70+
d.peerDb.Lock()
71+
pMap, ok := d.peerDb.mp[nid]
72+
d.peerDb.Unlock()
73+
if !ok {
74+
return peerEntry{}, false
8875
}
8976

90-
if !peerIPMatched.IsValid() || pEntryMatched == nil {
91-
return netip.Addr{}, nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP)
77+
pMap.Lock()
78+
defer pMap.Unlock()
79+
c, _ := pMap.mp.Get(peerIP)
80+
if len(c) == 0 {
81+
return peerEntry{}, false
9282
}
93-
94-
return peerIPMatched, peerMacMatched, pEntryMatched, nil
83+
return c[0], true
9584
}
9685

9786
func (d *driver) peerDbAdd(nid, eid string, peerIP netip.Prefix, peerMac net.HardwareAddr, vtep netip.Addr) (bool, int) {
@@ -103,22 +92,21 @@ func (d *driver) peerDbAdd(nid, eid string, peerIP netip.Prefix, peerMac net.Har
10392
}
10493
d.peerDb.Unlock()
10594

106-
pKey := ipmacOf(peerIP.Addr(), peerMac)
107-
10895
pEntry := peerEntry{
109-
eid: eid,
110-
vtep: vtep,
111-
prefixBits: peerIP.Bits(),
96+
eid: eid,
97+
mac: macAddrOf(peerMac),
98+
vtep: vtep,
11299
}
113100

114101
pMap.Lock()
115102
defer pMap.Unlock()
116-
b, i := pMap.mp.Insert(pKey, pEntry)
103+
b, i := pMap.mp.Insert(peerIP, pEntry)
117104
if i != 1 {
118-
// Transient case, there is more than one endpoint that is using the same IP,MAC pair
119-
s, _ := pMap.mp.String(pKey)
120-
log.G(context.TODO()).Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
105+
// Transient case, there is more than one endpoint that is using the same IP
106+
s, _ := pMap.mp.String(peerIP)
107+
log.G(context.TODO()).Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", peerIP, i, s)
121108
}
109+
122110
return b, i
123111
}
124112

@@ -131,21 +119,19 @@ func (d *driver) peerDbDelete(nid, eid string, peerIP netip.Prefix, peerMac net.
131119
}
132120
d.peerDb.Unlock()
133121

134-
pKey := ipmacOf(peerIP.Addr(), peerMac)
135-
136122
pEntry := peerEntry{
137-
eid: eid,
138-
vtep: vtep,
139-
prefixBits: peerIP.Bits(),
123+
eid: eid,
124+
mac: macAddrOf(peerMac),
125+
vtep: vtep,
140126
}
141127

142128
pMap.Lock()
143129
defer pMap.Unlock()
144-
b, i := pMap.mp.Remove(pKey, pEntry)
130+
b, i := pMap.mp.Remove(peerIP, pEntry)
145131
if i != 0 {
146-
// Transient case, there is more than one endpoint that is using the same IP,MAC pair
147-
s, _ := pMap.mp.String(pKey)
148-
log.G(context.TODO()).Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", pKey, i, s)
132+
// Transient case, there is more than one endpoint that is using the same IP
133+
s, _ := pMap.mp.String(peerIP)
134+
log.G(context.TODO()).Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", peerIP, i, s)
149135
}
150136
return b, i
151137
}
@@ -164,16 +150,12 @@ func (d *driver) peerDbDelete(nid, eid string, peerIP netip.Prefix, peerMac net.
164150
func (d *driver) initSandboxPeerDB(nid string) {
165151
d.peerOpMu.Lock()
166152
defer d.peerOpMu.Unlock()
167-
err := d.peerDbNetworkWalk(nid, func(peerIP netip.Addr, peerMac net.HardwareAddr, pEntry *peerEntry) bool {
153+
d.peerDbNetworkWalk(nid, func(peerIP netip.Prefix, pEntry peerEntry) bool {
168154
if !pEntry.isLocal() {
169-
d.addNeighbor(nid, netip.PrefixFrom(peerIP, pEntry.prefixBits), peerMac, pEntry.vtep)
155+
d.addNeighbor(nid, peerIP, pEntry.mac.HardwareAddr(), pEntry.vtep)
170156
}
171157
return false // walk all entries
172158
})
173-
174-
if err != nil {
175-
log.G(context.TODO()).WithError(err).Warn("Peer init operation failed")
176-
}
177159
}
178160

179161
// peerAdd adds a new entry to the peer database.
@@ -242,8 +224,10 @@ func (d *driver) addNeighbor(nid string, peerIP netip.Prefix, peerMac net.Hardwa
242224
}
243225

244226
// Add fdb entry to the bridge for the peer mac
245-
if err := sbox.AddNeighbor(vtep.AsSlice(), peerMac, osl.WithLinkName(s.vxlanName), osl.WithFamily(syscall.AF_BRIDGE)); err != nil {
246-
return fmt.Errorf("could not add fdb entry into the sandbox: %w", err)
227+
if n.fdbCnt.Add(ipmacOf(vtep, peerMac), 1) == 1 {
228+
if err := sbox.AddNeighbor(vtep.AsSlice(), peerMac, osl.WithLinkName(s.vxlanName), osl.WithFamily(syscall.AF_BRIDGE)); err != nil {
229+
return fmt.Errorf("could not add fdb entry into the sandbox: %w", err)
230+
}
247231
}
248232

249233
return nil
@@ -287,23 +271,22 @@ func (d *driver) peerDelete(nid, eid string, peerIP netip.Prefix, peerMac net.Ha
287271
if dbEntries > 0 {
288272
// If there is still an entry into the database and the deletion went through without errors means that there is now no
289273
// configuration active in the kernel.
290-
// Restore one configuration for the <ip,mac> directly from the database, note that is guaranteed that there is one
291-
peerIPAddr, peerMac, peerEntry, err := d.peerDbSearch(nid, peerIP.Addr())
292-
if err != nil {
274+
// Restore one configuration for the ip directly from the database, note that is guaranteed that there is one
275+
peerEntry, ok := d.peerDbGet(nid, peerIP)
276+
if !ok {
293277
log.G(context.TODO()).WithFields(log.Fields{
294278
"nid": nid,
295279
"ip": peerIP,
296-
}).WithError(err).Error("peerDelete unable to restore a configuration")
280+
}).Error("peerDelete unable to restore a configuration: no entry found in the database")
297281
return
298282
}
299-
peerIP = netip.PrefixFrom(peerIPAddr, peerEntry.prefixBits)
300-
err = d.addNeighbor(nid, peerIP, peerMac, peerEntry.vtep)
283+
err = d.addNeighbor(nid, peerIP, peerEntry.mac.HardwareAddr(), peerEntry.vtep)
301284
if err != nil {
302285
log.G(context.TODO()).WithFields(log.Fields{
303286
"nid": nid,
304287
"eid": eid,
305288
"ip": peerIP,
306-
"mac": peerMac,
289+
"mac": peerEntry.mac,
307290
"vtep": peerEntry.vtep,
308291
}).WithError(err).Error("Peer delete operation failed")
309292
}
@@ -335,8 +318,10 @@ func (d *driver) deleteNeighbor(nid string, peerIP netip.Prefix, peerMac net.Har
335318
return fmt.Errorf("could not find the subnet %q in network %q", peerIP.String(), n.id)
336319
}
337320
// Remove fdb entry to the bridge for the peer mac
338-
if err := sbox.DeleteNeighbor(vtep.AsSlice(), peerMac, osl.WithLinkName(s.vxlanName), osl.WithFamily(syscall.AF_BRIDGE)); err != nil {
339-
return fmt.Errorf("could not delete fdb entry in the sandbox: %w", err)
321+
if n.fdbCnt.Add(ipmacOf(vtep, peerMac), -1) == 0 {
322+
if err := sbox.DeleteNeighbor(vtep.AsSlice(), peerMac, osl.WithLinkName(s.vxlanName), osl.WithFamily(syscall.AF_BRIDGE)); err != nil {
323+
return fmt.Errorf("could not delete fdb entry in the sandbox: %w", err)
324+
}
340325
}
341326

342327
// Delete neighbor entry for the peer IP
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// FIXME(thaJeztah): remove once we are a module; the go:build directive prevents go from downgrading language version to go1.16:
2+
//go:build go1.23
3+
4+
package countmap
5+
6+
// Map is a map of counters.
7+
type Map[T comparable] map[T]int
8+
9+
// Add adds delta to the counter for v and returns the new value.
10+
//
11+
// If the new value is 0, the entry is removed from the map.
12+
func (m Map[T]) Add(v T, delta int) int {
13+
m[v] += delta
14+
c := m[v]
15+
if c == 0 {
16+
delete(m, v)
17+
}
18+
return c
19+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package countmap_test
2+
3+
import (
4+
"testing"
5+
6+
"github.com/docker/docker/libnetwork/internal/countmap"
7+
"gotest.tools/v3/assert"
8+
is "gotest.tools/v3/assert/cmp"
9+
)
10+
11+
func TestMap(t *testing.T) {
12+
m := countmap.Map[string]{}
13+
m["foo"] = 7
14+
m["bar"] = 2
15+
m["zeroed"] = -2
16+
17+
m.Add("bar", -3)
18+
m.Add("foo", -8)
19+
m.Add("baz", 1)
20+
m.Add("zeroed", 2)
21+
assert.Check(t, is.DeepEqual(m, countmap.Map[string]{"foo": -1, "bar": -1, "baz": 1}))
22+
23+
m.Add("foo", 1)
24+
m.Add("bar", 1)
25+
m.Add("baz", -1)
26+
assert.Check(t, is.DeepEqual(m, countmap.Map[string]{}))
27+
}

0 commit comments

Comments
 (0)