
Commit ada8bc3

libnetwork/networkdb: record tombstones for all deletes
The gossip protocol which powers NetworkDB does not guarantee in-order reception of events. This poses a problem for deleting entries: without some mechanism to discard stale CREATE or UPDATE events received after a DELETE, out-of-order reception of events could spuriously resurrect a deleted entry in the local NetworkDB state!

NetworkDB handles this situation by storing "tombstone" entries for a period of time, tagged with the Lamport timestamps of the entries' respective DELETE events. Out-of-order CREATE or UPDATE events are ignored by virtue of having older timestamps than the tombstone entry, just as they are for entries that have not yet been deleted.

However, NetworkDB was only storing a tombstone if the entry was already present in the local database at the time of the DELETE event. If the first event received for an entry is a DELETE, no tombstone is stored; if a stale CREATE or UPDATE event for the entry (with an older timestamp than the DELETE) is subsequently received, NetworkDB erroneously creates a live entry in the local state with stale data.

Modify NetworkDB to store tombstones for DELETE events irrespective of whether the entry was known to NetworkDB beforehand, so that it correctly discards out-of-order CREATEs and UPDATEs in all cases.

Signed-off-by: Cory Snider <[email protected]>
1 parent: c68671d
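The tombstone mechanism described above reduces to one rule: apply an event only if its Lamport timestamp is newer than whatever state the node already holds for that key, where a tombstone counts as state just like a live entry. The following minimal, self-contained Go sketch illustrates that rule under the commit's fix; it is not the NetworkDB implementation, and every name in it (record, table, apply) is invented for the example.

package main

import "fmt"

type eventType int

const (
	eventCreate eventType = iota
	eventUpdate
	eventDelete
)

type record struct {
	ltime   uint64 // Lamport timestamp of the last applied event
	deleted bool   // true when the record is a tombstone
	value   string
}

type table map[string]*record

// apply discards any event whose Lamport timestamp is not newer than the
// state already held for the key; a tombstone rejects stale events exactly
// as a live entry does.
func (t table) apply(key string, typ eventType, ltime uint64, value string) {
	if r, ok := t[key]; ok && ltime <= r.ltime {
		return // stale event, discarded
	}
	t[key] = &record{ltime: ltime, deleted: typ == eventDelete, value: value}
}

func main() {
	t := table{}
	// The DELETE arrives first. Recording a tombstone even though the key is
	// unknown is the behavior this commit introduces.
	t.apply("key5", eventDelete, 16, "")
	// The out-of-order CREATE carries the older timestamp 15 and is discarded.
	t.apply("key5", eventCreate, 15, "a")
	fmt.Println(t["key5"].deleted) // true: the entry stays deleted
}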

2 files changed: 25 additions & 19 deletions


libnetwork/networkdb/delegate.go

Lines changed: 17 additions & 19 deletions
@@ -179,14 +179,6 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent, isBulkSync bool) bool
 			nDB.Unlock()
 			return false
 		}
-	} else if tEvent.Type == TableEventTypeDelete && !isBulkSync {
-		nDB.Unlock()
-		// We don't know the entry, the entry is being deleted and the message is an async message
-		// In this case the safest approach is to ignore it, it is possible that the queue grew so much to
-		// exceed the garbage collection time (the residual reap time that is in the message is not being
-		// updated, to avoid inserting too many messages in the queue).
-		// Instead the messages coming from TCP bulk sync are safe with the latest value for the garbage collection time
-		return false
 	}
 
 	e := &entry{
@@ -208,18 +200,24 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent, isBulkSync bool) bool
 	nDB.createOrUpdateEntry(tEvent.NetworkID, tEvent.TableName, tEvent.Key, e)
 	nDB.Unlock()
 
-	if err != nil && tEvent.Type == TableEventTypeDelete {
-		// Again we don't know the entry but this is coming from a TCP sync so the message body is up to date.
-		// We had saved the state so to speed up convergence and be able to avoid accepting create events.
-		// Now we will rebroadcast the message if 2 conditions are met:
-		// 1) we had already synced this network (during the network join)
-		// 2) the residual reapTime is higher than 1/6 of the total reapTime.
-		// If the residual reapTime is lower or equal to 1/6 of the total reapTime don't bother broadcasting it around
-		// most likely the cluster is already aware of it
-		// This also reduce the possibility that deletion of entries close to their garbage collection ends up circling around
-		// forever
+	if !entryPresent && tEvent.Type == TableEventTypeDelete {
+		// We will rebroadcast the message for an unknown entry if all the conditions are met:
+		// 1) the message was received from a bulk sync
+		// 2) we had already synced this network (during the network join)
+		// 3) the residual reapTime is higher than 1/6 of the total reapTime.
+		//
+		// If the residual reapTime is lower or equal to 1/6 of the total reapTime
+		// don't bother broadcasting it around as most likely the cluster is already aware of it.
+		// This also reduces the possibility that deletion of entries close to their garbage collection
+		// ends up circling around forever.
+		//
+		// The safest approach is to not rebroadcast async messages for unknown entries.
+		// It is possible that the queue grew so much to exceed the garbage collection time
+		// (the residual reap time that is in the message is not being updated, to avoid
+		// inserting too many messages in the queue).
+
 		// log.G(ctx).Infof("exiting on delete not knowing the obj with rebroadcast:%t", network.inSync)
-		return network.inSync && e.reapTime > nDB.config.reapEntryInterval/6
+		return isBulkSync && network.inSync && e.reapTime > nDB.config.reapEntryInterval/6
 	}
 
 	var op opType
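The new return expression packs the rebroadcast decision for unknown entries into a single boolean. Restated as a standalone pure function for readability (the helper and its package are hypothetical, not part of the codebase):

package sketch

import "time"

// shouldRebroadcastUnknownDelete restates the return expression above.
// Async gossip messages may carry a stale residual reap time, so only
// deletes of unknown entries received via TCP bulk sync are rebroadcast,
// and only while more than 1/6 of the garbage-collection window remains.
func shouldRebroadcastUnknownDelete(isBulkSync, networkInSync bool, residualReap, reapEntryInterval time.Duration) bool {
	return isBulkSync && networkInSync && residualReap > reapEntryInterval/6
}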

libnetwork/networkdb/watch_test.go

Lines changed: 8 additions & 0 deletions
@@ -60,6 +60,12 @@ func TestWatch_out_of_order(t *testing.T) {
 	appendTableEvent(13, TableEventTypeUpdate, "key4", []byte("b"))
 	appendTableEvent(14, TableEventTypeUpdate, "key4", []byte("c"))
 
+	// Delete, create
+	appendTableEvent(16, TableEventTypeDelete, "key5", []byte("a"))
+	appendTableEvent(15, TableEventTypeCreate, "key5", []byte("a"))
+	// (Hidden recreate), delete
+	appendTableEvent(18, TableEventTypeDelete, "key5", []byte("b"))
+
 	d.NotifyMsg(msgs.Compound())
 	msgs.Reset()
 
@@ -76,6 +82,8 @@ func TestWatch_out_of_order(t *testing.T) {
 		CreateEvent(event{Table: "table1", NetworkID: "network1", Key: "key3", Value: []byte("b")}),
 		CreateEvent(event{Table: "table1", NetworkID: "network1", Key: "key4", Value: []byte("b")}),
 		UpdateEvent(event{Table: "table1", NetworkID: "network1", Key: "key4", Value: []byte("c")}),
+
+		// key5 should not appear in the events.
 	}))
 }
 
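Reading the Lamport timestamps in this test: the DELETE at ltime 16 arrives before the CREATE at ltime 15 it supersedes, so the node's first sight of key5 is a DELETE for an unknown entry. The "hidden recreate" is an event at ltime 17, presumably a CREATE with value "b", that is never delivered; the DELETE at ltime 18 therefore also concerns an entry the node has never seen live. With tombstones recorded for both deletes, the stale CREATE is discarded and no key5 event reaches watchers, hence the assertion that key5 never appears in the expected events.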