Skip to content

Commit 3eb84dc

Browse files
Backport #93779 to 25.11: Fix attaching Replicated DBs when the interserver host has changed after a restart
1 parent 6bca76e commit 3eb84dc

File tree

2 files changed

+126
-35
lines changed

2 files changed

+126
-35
lines changed

src/Databases/DatabaseReplicated.cpp

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
#include <Core/UUID.h>
12
#include <DataTypes/DataTypeString.h>
23

34
#include <atomic>
5+
#include <tuple>
46
#include <utility>
57

68
#include <Backups/IRestoreCoordination.h>
@@ -111,6 +113,7 @@ namespace ErrorCodes
111113
extern const int SUPPORT_IS_DISABLED;
112114
extern const int ASYNC_LOAD_CANCELED;
113115
extern const int KEEPER_EXCEPTION;
116+
extern const int SYNTAX_ERROR;
114117
}
115118
namespace FailPoints
116119
{
@@ -135,6 +138,21 @@ static inline String getHostID(ContextPtr global_context, const UUID & db_uuid,
135138
return Cluster::Address::toString(host_port.first, port) + ':' + toString(db_uuid);
136139
}
137140

141+
// Return <address, port, uuid>
142+
static inline std::tuple<String, UInt16, UUID> parseHostID(const String & content)
143+
{
144+
auto pos = content.find_last_of(':');
145+
if (pos == std::string::npos || pos + 1 >= content.size())
146+
throw Exception(ErrorCodes::SYNTAX_ERROR, "Invalid host ID '{}'", content);
147+
148+
auto [address, port] = Cluster::Address::fromString(content.substr(0, pos));
149+
UUID db_uuid;
150+
if (!tryParse(db_uuid, content.substr(pos + 1)))
151+
throw Exception(ErrorCodes::SYNTAX_ERROR, "Invalid host ID '{}'", content);
152+
153+
return {address, port, db_uuid};
154+
}
155+
138156
static inline UInt64 getMetadataHash(const String & table_name, const String & metadata)
139157
{
140158
SipHash hash;
@@ -571,10 +589,39 @@ void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(LoadingStrictnessL
571589

572590
if (replica_host_id != host_id && replica_host_id != host_id_default)
573591
{
574-
throw Exception(
575-
ErrorCodes::REPLICA_ALREADY_EXISTS,
576-
"Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'",
577-
replica_name, shard_name, zookeeper_path, replica_host_id, host_id);
592+
UUID uuid_in_keeper = UUIDHelpers::Nil;
593+
try
594+
{
595+
uuid_in_keeper = std::get<2>(parseHostID(replica_host_id));
596+
}
597+
catch (const Exception & e)
598+
{
599+
LOG_WARNING(log, "Failed to parse host_id {} in zookeeper, error {}", replica_host_id, e.what());
600+
}
601+
602+
if (uuid_in_keeper != db_uuid)
603+
throw Exception(
604+
ErrorCodes::REPLICA_ALREADY_EXISTS,
605+
"Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'",
606+
replica_name,
607+
shard_name,
608+
zookeeper_path,
609+
replica_host_id,
610+
host_id);
611+
612+
// After restarting, InterserverIOAddress might change (e.g: config updated, `getFQDNOrHostName` returns a different one)
613+
// If the UUID in the keeper is the same as the current server UUID, we will update the host_id in keeper
614+
LOG_INFO(
615+
log,
616+
"Replicated database replica: {}, shard {}, zk_path: {} already exists with the same UUID, replica host ID: '{}', "
617+
"current host ID: '{}', will set the host_id to the current host ID",
618+
replica_name,
619+
shard_name,
620+
zookeeper_path,
621+
replica_host_id,
622+
host_id);
623+
current_zookeeper->set(replica_path, host_id, -1);
624+
createEmptyLogEntry(current_zookeeper);
578625
}
579626

580627
/// Before 24.6 we always created host_id with insecure port, even if cluster_auth_info.cluster_secure_connection was true.

tests/integration/test_replicated_database_interserver_host/test.py

Lines changed: 75 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import pytest
22
import urllib.parse
33
from helpers.cluster import ClickHouseCluster
4+
import os
45

56
cluster = ClickHouseCluster(__file__)
67

@@ -21,53 +22,63 @@
2122
)
2223

2324

24-
@pytest.fixture(scope="module")
25-
def started_cluster():
26-
try:
27-
cluster.start()
25+
def update_interserver_http_address(node, new_address):
26+
config_path = os.path.join(os.path.dirname(__file__), "configs/config.xml")
2827

29-
# Replace NODE_NAME placeholder with actual IP addresses in config
30-
import os
31-
config_path = os.path.join(
32-
os.path.dirname(__file__), "configs/config.xml"
33-
)
28+
config_content = open(config_path).read()
29+
config_content = config_content.replace("NODE_NAME", new_address)
3430

35-
for node in [node1, node2]:
36-
config_content = open(config_path).read()
37-
config_content = config_content.replace("NODE_NAME", node.ip_address)
31+
# Debug: print the config and IP address
32+
print(f"Configuring {node.name} with address {new_address}")
33+
print(f"New config:\n{config_content}")
3834

39-
# Debug: print the config and IP address
40-
print(f"Configuring {node.name} with IP {node.ip_address}")
41-
print(f"Config content:\n{config_content}")
35+
node.exec_in_container(
36+
[
37+
"bash",
38+
"-c",
39+
'echo "${NEW_CONFIG}" > /etc/clickhouse-server/config.d/interserver_host.xml',
40+
],
41+
environment={"NEW_CONFIG": config_content},
42+
)
4243

43-
node.exec_in_container(
44-
["bash", "-c", 'echo "${NEW_CONFIG}" > /etc/clickhouse-server/config.d/interserver_host.xml'],
45-
environment={"NEW_CONFIG": config_content},
46-
)
44+
# Verify the file was written
45+
result = node.exec_in_container(
46+
["cat", "/etc/clickhouse-server/config.d/interserver_host.xml"]
47+
)
48+
print(f"Verification - Config file on {node.name}:\n{result}")
4749

48-
# Verify the file was written
49-
result = node.exec_in_container(
50-
["cat", "/etc/clickhouse-server/config.d/interserver_host.xml"]
51-
)
52-
print(f"Verification - Config file on {node.name}:\n{result}")
50+
# IMPORTANT: interserver_http_host is only loaded at startup, not by SYSTEM RELOAD CONFIG
51+
# So we need to restart ClickHouse
52+
print(
53+
f"Restarting ClickHouse on {node.name} to apply interserver_http_host config..."
54+
)
55+
node.restart_clickhouse()
5356

54-
# IMPORTANT: interserver_http_host is only loaded at startup, not by SYSTEM RELOAD CONFIG
55-
# So we need to restart ClickHouse
56-
print(f"Restarting ClickHouse on {node.name} to apply interserver_http_host config...")
57-
node.restart_clickhouse()
57+
# Verify the setting was applied
58+
interserver_host = node.query(
59+
"SELECT value FROM system.server_settings WHERE name = 'interserver_http_host'"
60+
)
61+
print(
62+
f"Verification - interserver_http_host setting on {node.name}: {interserver_host}"
63+
)
5864

59-
# Verify the setting was applied
60-
interserver_host = node.query("SELECT value FROM system.server_settings WHERE name = 'interserver_http_host'")
61-
print(f"Verification - interserver_http_host setting on {node.name}: {interserver_host}")
6265

66+
@pytest.fixture(scope="module")
67+
def started_cluster():
68+
try:
69+
cluster.start()
6370
yield cluster
71+
6472
finally:
6573
cluster.shutdown()
6674

6775

6876
def test_replicated_database_uses_interserver_host(started_cluster):
6977
"""Test that DatabaseReplicated uses interserver_http_host for replica registration."""
7078

79+
for node in [node1, node2]:
80+
update_interserver_http_address(node, node.ip_address)
81+
7182
node1.query(
7283
"CREATE DATABASE test_db ENGINE = Replicated('/clickhouse/databases/test_db', 'shard1', 'node1')"
7384
)
@@ -106,3 +117,36 @@ def test_replicated_database_uses_interserver_host(started_cluster):
106117

107118
node1.query("DROP DATABASE test_db SYNC")
108119
node2.query("DROP DATABASE test_db SYNC")
120+
121+
122+
def test_replicated_database_uses_interserver_host_changed(started_cluster):
123+
"""Test that interserver_http_host changed, and Replicated database is still able to be attached after restarting"""
124+
125+
node1.query(
126+
"CREATE DATABASE test_db ENGINE = Replicated('/clickhouse/databases/test_db', 'shard1', 'node1')"
127+
)
128+
node2.query(
129+
"CREATE DATABASE test_db ENGINE = Replicated('/clickhouse/databases/test_db', 'shard1', 'node2')"
130+
)
131+
132+
node1.query("CREATE TABLE test_db.t (x INT, y INT) ENGINE=MergeTree ORDER BY x")
133+
134+
for node in [node1, node2]:
135+
update_interserver_http_address(node, node.name)
136+
137+
node.query("SYSTEM FLUSH LOGS")
138+
assert (
139+
node.query(
140+
"""
141+
SELECT count()
142+
FROM system.text_log
143+
WHERE (level='Error')
144+
AND (logger_name='DatabaseReplicated (test_db)')
145+
AND (message LIKE '%replicated database at /clickhouse/databases/test_db already exists%')
146+
"""
147+
).strip()
148+
== "0"
149+
)
150+
151+
node1.query("DROP DATABASE test_db SYNC")
152+
node2.query("DROP DATABASE test_db SYNC")

0 commit comments

Comments
 (0)