
Commit ada8bc3

libnetwork/networkdb: record tombstones for all deletes
The gossip protocol which powers NetworkDB does not guarantee in-order reception of events. This poses a problem for deleting entries: without some mechanism to discard stale CREATE or UPDATE events received after a DELETE, out-of-order reception of events could spuriously resurrect a deleted entry in the local NetworkDB state!

NetworkDB handles this situation by storing "tombstone" entries for a period of time, tagged with the Lamport timestamps of the entries' respective DELETE events. Out-of-order CREATE or UPDATE events are ignored by virtue of having older timestamps than the tombstone entry, just as they are for entries that have not yet been deleted.

However, NetworkDB was only storing a tombstone if the entry was already present in the local database at the time of the DELETE event. If the first event received for an entry is a DELETE, no tombstone is stored; if a stale CREATE or UPDATE event for the entry (with an older timestamp than the DELETE) is subsequently received, NetworkDB erroneously creates a live entry in the local state with stale data.

Modify NetworkDB to store tombstones for DELETE events irrespective of whether the entry was known to NetworkDB beforehand, so that it correctly discards out-of-order CREATEs and UPDATEs in all cases.

Signed-off-by: Cory Snider <[email protected]>
1 parent: c68671d
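The tombstone mechanism described above reduces to one rule: apply an event only if its Lamport timestamp is newer than whatever state the node already holds for that key, where a tombstone counts as state just like a live entry. The following minimal, self-contained Go sketch illustrates that rule under the commit's fix; it is not the NetworkDB implementation, and every name in it (record, table, apply) is invented for the example.

package main

import "fmt"

type eventType int

const (
	eventCreate eventType = iota
	eventUpdate
	eventDelete
)

type record struct {
	ltime   uint64 // Lamport timestamp of the last applied event
	deleted bool   // true when the record is a tombstone
	value   string
}

type table map[string]*record

// apply discards any event whose Lamport timestamp is not newer than the
// state already held for the key; a tombstone rejects stale events exactly
// as a live entry does.
func (t table) apply(key string, typ eventType, ltime uint64, value string) {
	if r, ok := t[key]; ok && ltime <= r.ltime {
		return // stale event, discarded
	}
	t[key] = &record{ltime: ltime, deleted: typ == eventDelete, value: value}
}

func main() {
	t := table{}
	// The DELETE arrives first. Recording a tombstone even though the key is
	// unknown is the behavior this commit introduces.
	t.apply("key5", eventDelete, 16, "")
	// The out-of-order CREATE carries the older timestamp 15 and is discarded.
	t.apply("key5", eventCreate, 15, "a")
	fmt.Println(t["key5"].deleted) // true: the entry stays deleted
}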

2 files changed: 25 additions & 19 deletions


libnetwork/networkdb/delegate.go

Lines changed: 17 additions & 19 deletions
@@ -179,14 +179,6 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent, isBulkSync bool) bool
 			nDB.Unlock()
 			return false
 		}
-	} else if tEvent.Type == TableEventTypeDelete && !isBulkSync {
-		nDB.Unlock()
-		// We don't know the entry, the entry is being deleted and the message is an async message
-		// In this case the safest approach is to ignore it, it is possible that the queue grew so much to
-		// exceed the garbage collection time (the residual reap time that is in the message is not being
-		// updated, to avoid inserting too many messages in the queue).
-		// Instead the messages coming from TCP bulk sync are safe with the latest value for the garbage collection time
-		return false
 	}
 
 	e := &entry{
@@ -208,18 +200,24 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent, isBulkSync bool) bool
 	nDB.createOrUpdateEntry(tEvent.NetworkID, tEvent.TableName, tEvent.Key, e)
 	nDB.Unlock()
 
-	if err != nil && tEvent.Type == TableEventTypeDelete {
-		// Again we don't know the entry but this is coming from a TCP sync so the message body is up to date.
-		// We had saved the state so to speed up convergence and be able to avoid accepting create events.
-		// Now we will rebroadcast the message if 2 conditions are met:
-		// 1) we had already synced this network (during the network join)
-		// 2) the residual reapTime is higher than 1/6 of the total reapTime.
-		// If the residual reapTime is lower or equal to 1/6 of the total reapTime don't bother broadcasting it around
-		// most likely the cluster is already aware of it
-		// This also reduce the possibility that deletion of entries close to their garbage collection ends up circling around
-		// forever
+	if !entryPresent && tEvent.Type == TableEventTypeDelete {
+		// We will rebroadcast the message for an unknown entry if all the conditions are met:
+		// 1) the message was received from a bulk sync
+		// 2) we had already synced this network (during the network join)
+		// 3) the residual reapTime is higher than 1/6 of the total reapTime.
+		//
+		// If the residual reapTime is lower or equal to 1/6 of the total reapTime
+		// don't bother broadcasting it around as most likely the cluster is already aware of it.
+		// This also reduces the possibility that deletion of entries close to their garbage collection
+		// ends up circling around forever.
+		//
+		// The safest approach is to not rebroadcast async messages for unknown entries.
+		// It is possible that the queue grew so much to exceed the garbage collection time
+		// (the residual reap time that is in the message is not being updated, to avoid
+		// inserting too many messages in the queue).
+
 		// log.G(ctx).Infof("exiting on delete not knowing the obj with rebroadcast:%t", network.inSync)
-		return network.inSync && e.reapTime > nDB.config.reapEntryInterval/6
+		return isBulkSync && network.inSync && e.reapTime > nDB.config.reapEntryInterval/6
 	}
 
 	var op opType
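The new return expression packs the rebroadcast decision for unknown entries into a single boolean. Restated as a standalone pure function for readability (the helper and its package are hypothetical, not part of the codebase):

package sketch

import "time"

// shouldRebroadcastUnknownDelete restates the return expression above.
// Async gossip messages may carry a stale residual reap time, so only
// deletes of unknown entries received via TCP bulk sync are rebroadcast,
// and only while more than 1/6 of the garbage-collection window remains.
func shouldRebroadcastUnknownDelete(isBulkSync, networkInSync bool, residualReap, reapEntryInterval time.Duration) bool {
	return isBulkSync && networkInSync && residualReap > reapEntryInterval/6
}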

libnetwork/networkdb/watch_test.go

Lines changed: 8 additions & 0 deletions
@@ -60,6 +60,12 @@ func TestWatch_out_of_order(t *testing.T) {
 	appendTableEvent(13, TableEventTypeUpdate, "key4", []byte("b"))
 	appendTableEvent(14, TableEventTypeUpdate, "key4", []byte("c"))
 
+	// Delete, create
+	appendTableEvent(16, TableEventTypeDelete, "key5", []byte("a"))
+	appendTableEvent(15, TableEventTypeCreate, "key5", []byte("a"))
+	// (Hidden recreate), delete
+	appendTableEvent(18, TableEventTypeDelete, "key5", []byte("b"))
+
 	d.NotifyMsg(msgs.Compound())
 	msgs.Reset()
 
@@ -76,6 +82,8 @@ func TestWatch_out_of_order(t *testing.T) {
 		CreateEvent(event{Table: "table1", NetworkID: "network1", Key: "key3", Value: []byte("b")}),
 		CreateEvent(event{Table: "table1", NetworkID: "network1", Key: "key4", Value: []byte("b")}),
 		UpdateEvent(event{Table: "table1", NetworkID: "network1", Key: "key4", Value: []byte("c")}),
+
+		// key5 should not appear in the events.
 	}))
 }
 
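Reading the Lamport timestamps in this test: the DELETE at ltime 16 arrives before the CREATE at ltime 15 it supersedes, so the node's first sight of key5 is a DELETE for an unknown entry. The "hidden recreate" is an event at ltime 17, presumably a CREATE with value "b", that is never delivered; the DELETE at ltime 18 therefore also concerns an entry the node has never seen live. With tombstones recorded for both deletes, the stale CREATE is discarded and no key5 event reaches watchers, hence the assertion that key5 never appears in the expected events.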