Skip to content

Commit aff444d

Browse files
committed
libnetwork/networkdb: make TestNetworkDBIslands not flaky
With rejoinClusterBootStrap fixed in tests, split clusters should reliably self-heal in tests as well as production. Work around the other source of flakiness in TestNetworkDBIslands: timing out waiting for a failed node to transition to gracefully left. This flake happens when one of the leaving nodes sends its NodeLeft message to the other leaving node, and the second is shut down before it has a chance to rebroadcast the message to the remaining nodes. The proper fix would be to leverage memberlist's own bookkeeping instead of duplicating it poorly with user messages, but doing so requires a change in the memberlist module. Instead have the test check that the sum of failed+left nodes is as expected instead of waiting for all nodes to have left==3 && failed==0. Signed-off-by: Cory Snider <[email protected]>
1 parent 1e1be54 commit aff444d

1 file changed

Lines changed: 5 additions & 5 deletions

File tree

libnetwork/networkdb/networkdb_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -881,7 +881,7 @@ func TestParallelDelete(t *testing.T) {
881881
closeNetworkDBInstances(t, dbs)
882882
}
883883

884-
func TestFlakyNetworkDBIslands(t *testing.T) {
884+
func TestNetworkDBIslands(t *testing.T) {
885885
pollTimeout := func() time.Duration {
886886
const defaultTimeout = 120 * time.Second
887887
dl, ok := t.Deadline()
@@ -933,15 +933,15 @@ func TestFlakyNetworkDBIslands(t *testing.T) {
933933
// Verify that the nodes are actually all gone and marked appropriately
934934
for name, db := range checkDBs {
935935
db.RLock()
936-
if (len(db.leftNodes) != 3) || (len(db.failedNodes) != 0) {
936+
if (len(db.leftNodes) + len(db.failedNodes)) != 3 {
937937
for name := range db.leftNodes {
938938
t.Logf("%s: Node %s left", db.config.Hostname, name)
939939
}
940940
for name := range db.failedNodes {
941941
t.Logf("%s: Node %s failed", db.config.Hostname, name)
942942
}
943943
db.RUnlock()
944-
return poll.Continue("%s:Waiting for all nodes to cleanly leave, left: %d, failed nodes: %d", name, len(db.leftNodes), len(db.failedNodes))
944+
return poll.Continue("%s:Waiting for all nodes to leave, left: %d, failed nodes: %d", name, len(db.leftNodes), len(db.failedNodes))
945945
}
946946
db.RUnlock()
947947
t.Logf("%s: OK", name)
@@ -981,9 +981,9 @@ func TestFlakyNetworkDBIslands(t *testing.T) {
981981
}
982982
} else {
983983
// nodes from 4 to 5 has the 3 previous left nodes
984-
if len(db.leftNodes) != 3 {
984+
if (len(db.leftNodes) + len(db.failedNodes)) != 3 {
985985
db.RUnlock()
986-
return poll.Continue("%s:Waiting to have 3 leftNodes", dbs[i].config.Hostname)
986+
return poll.Continue("%s:Waiting to have 3 leftNodes+failedNodes", dbs[i].config.Hostname)
987987
}
988988
}
989989
db.RUnlock()

0 commit comments

Comments
 (0)