Skip to content

Commit c79630b

Browse files
Fix flaky test_short_disconnection_doesnt_stop_backup and test_short_disconnection_doesnt_stop_restore
When `faster_zk_disconnect_detect.xml` is randomly chosen (which sets `session_timeout_ms=5000`), the ZK connection drop via iptables must be short enough to avoid session expiry.

Previously, the drop lasted up to 4 seconds in the backup test and up to 3 seconds in the restore test, via `random_sleep`. Combined with the time since the last heartbeat (~1.7s for a 5s session timeout) and the reconnection overhead, the total silence could exceed 5 seconds. That caused the ZK session to expire and the backup/restore to fail, regardless of the 30-second `failure_after_host_disconnected_for_seconds` threshold.

The fix limits the drop duration to 1 second when using the faster ZK disconnect detection config, while keeping the original duration when using the default ZK settings.

CI report: https://s3.amazonaws.com/clickhouse-test-reports/json.html?PR=96758&sha=d29b41fbe684f8c90ace4fd71828ce0d4ac8b88f&name_0=PR&name_1=Integration%20tests%20%28arm_binary%2C%20distributed%20plan%2C%204%2F4%29

Closes: #80359

Co-Authored-By: Claude Sonnet 4.5 (1M context) <[email protected]>
1 parent 190cda3 commit c79630b

File tree

1 file changed: +8 -2 lines changed

tests/integration/test_backup_restore_on_cluster/test_cancel_backup.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -679,14 +679,17 @@ def test_short_disconnection_doesnt_stop_backup():
679679
assert get_num_system_processes(initiator, backup_id=backup_id) >= 1
680680

681681
# Dropping connection for less than `failure_after_host_disconnected_for_seconds`
682+
# When using faster_zk_disconnect_detect.xml (session_timeout_ms=5000),
683+
# the drop duration must be short enough to avoid ZK session expiry.
684+
max_drop_seconds = 1 if use_faster_zk_disconnect_detect else 4
682685
with PartitionManager() as pm:
683686
random_sleep(4)
684687
node_to_drop_zk_connection = random_node()
685688
print(
686689
f"Dropping connection between {get_node_name(node_to_drop_zk_connection)} and ZooKeeper at {format_current_time()}"
687690
)
688691
pm.drop_instance_zk_connections(node_to_drop_zk_connection)
689-
random_sleep(4)
692+
random_sleep(max_drop_seconds)
690693
print(
691694
f"Restoring connection between {get_node_name(node_to_drop_zk_connection)} and ZooKeeper at {format_current_time()}"
692695
)
@@ -733,14 +736,17 @@ def test_short_disconnection_doesnt_stop_restore():
733736
assert get_num_system_processes(initiator, restore_id=restore_id) >= 1
734737

735738
# Dropping connection for less than `failure_after_host_disconnected_for_seconds`
739+
# When using faster_zk_disconnect_detect.xml (session_timeout_ms=5000),
740+
# the drop duration must be short enough to avoid ZK session expiry.
741+
max_drop_seconds = 1 if use_faster_zk_disconnect_detect else 3
736742
with PartitionManager() as pm:
737743
random_sleep(3)
738744
node_to_drop_zk_connection = random_node()
739745
print(
740746
f"Dropping connection between {get_node_name(node_to_drop_zk_connection)} and ZooKeeper at {format_current_time()}"
741747
)
742748
pm.drop_instance_zk_connections(node_to_drop_zk_connection)
743-
random_sleep(3)
749+
random_sleep(max_drop_seconds)
744750
print(
745751
f"Restoring connection between {get_node_name(node_to_drop_zk_connection)} and ZooKeeper at {format_current_time()}"
746752
)

0 commit comments

Comments
 (0)