Skip to content

Commit b64997e

Browse files
committed
Fix race conditions in overlay network driver
The overlay network driver is not properly using it's mutexes or sync.Onces. It made the classic mistake of not holding a lock through various read-modify-write operations. This can result in inconsistent state storage leading to more catastrophic issues. This patch attempts to maintain the previous semantics while holding the driver lock through operations that are read-modify-write of the driver's network state. One example of this race would be if two goroutines tried to invoke d.network() after the network ID was removed from the table. Both would try to reinstall it causing the "once" to get reinitialized twice without any lock protection. This could then lead to the "once" getting invoked twice on the same network. Furthermore, the changes to one of these network structures gets effectively discarded. It's also the case, that because there would be two simultaneous instances of the network, the various network Lock() invocations would be meaningless for race prevention. Signed-off-by: Chris Telfer <[email protected]>
1 parent 5c679b0 commit b64997e

1 file changed

Lines changed: 32 additions & 23 deletions

File tree

libnetwork/drivers/overlay/ov_network.go

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,12 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo d
203203
n.subnets = append(n.subnets, s)
204204
}
205205

206+
d.Lock()
207+
defer d.Unlock()
208+
if d.networks[n.id] != nil {
209+
return fmt.Errorf("attempt to create overlay network %v that already exists", n.id)
210+
}
211+
206212
if err := n.writeToStore(); err != nil {
207213
return fmt.Errorf("failed to update data store for network %v: %v", n.id, err)
208214
}
@@ -217,11 +223,13 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo d
217223

218224
if nInfo != nil {
219225
if err := nInfo.TableEventRegister(ovPeerTable, driverapi.EndpointObject); err != nil {
226+
// XXX Undo writeToStore? No method to so. Why?
220227
return err
221228
}
222229
}
223230

224-
d.addNetwork(n)
231+
d.networks[id] = n
232+
225233
return nil
226234
}
227235

@@ -235,7 +243,15 @@ func (d *driver) DeleteNetwork(nid string) error {
235243
return err
236244
}
237245

238-
n := d.network(nid)
246+
d.Lock()
247+
defer d.Unlock()
248+
249+
// This is similar to d.network(), but we need to keep holding the lock
250+
// until we are done removing this network.
251+
n, ok := d.networks[nid]
252+
if !ok {
253+
n = d.restoreNetworkFromStore(nid)
254+
}
239255
if n == nil {
240256
return fmt.Errorf("could not find network with id %s", nid)
241257
}
@@ -255,7 +271,7 @@ func (d *driver) DeleteNetwork(nid string) error {
255271
}
256272
// flush the peerDB entries
257273
d.peerFlush(nid)
258-
d.deleteNetwork(nid)
274+
delete(d.networks, nid)
259275

260276
vnis, err := n.releaseVxlanID()
261277
if err != nil {
@@ -805,32 +821,25 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket, nsPath string) {
805821
}
806822
}
807823

808-
func (d *driver) addNetwork(n *network) {
809-
d.Lock()
810-
d.networks[n.id] = n
811-
d.Unlock()
812-
}
813-
814-
func (d *driver) deleteNetwork(nid string) {
815-
d.Lock()
816-
delete(d.networks, nid)
817-
d.Unlock()
824+
// Restore a network from the store to the driver if it is present.
825+
// Must be called with the driver locked!
826+
func (d *driver) restoreNetworkFromStore(nid string) *network {
827+
n := d.getNetworkFromStore(nid)
828+
if n != nil {
829+
n.driver = d
830+
n.endpoints = endpointTable{}
831+
n.once = &sync.Once{}
832+
d.networks[nid] = n
833+
}
834+
return n
818835
}
819836

820837
func (d *driver) network(nid string) *network {
821838
d.Lock()
839+
defer d.Unlock()
822840
n, ok := d.networks[nid]
823-
d.Unlock()
824841
if !ok {
825-
n = d.getNetworkFromStore(nid)
826-
if n != nil {
827-
n.driver = d
828-
n.endpoints = endpointTable{}
829-
n.once = &sync.Once{}
830-
d.Lock()
831-
d.networks[nid] = n
832-
d.Unlock()
833-
}
842+
n = d.restoreNetworkFromStore(nid)
834843
}
835844

836845
return n

0 commit comments

Comments
 (0)