Skip to content

Commit 08712e9

Browse files
snowphtuch
authored andcommitted
cluster: allow ignoring health check status during eds removal (#3302)
This PR adds a configuration flag that allows disabling the "eventually consistent" aspect of endpoint updates: instead of waiting for the endpoints to go unhealthy before removing them from the cluster, do so immediately. This gives greater control to the control plane in cases where one might want to divert traffic from an endpoint without having it go unhealthy. The flag goes on the cluster and so applies to all endpoints within that cluster. Risk Level: Low: Small configurable feature which reuses existing behavior (identical to the behavior when no health checker is configured). Defaults to disabled, so should have no impact on existing configurations. Testing: Added unit test for the case when endpoints are healthy then removed from the ClusterLoadAssignment in a subsequent config update. Docs Changes: Docs were added to the proto field. Release Notes: Added cluster: Add :ref:`option <envoy_api_field_Clister.drain_connections_on_eds_removal>` to drain endpoints after they are removed through EDS, despite health status. to the release notes. [Optional Fixes #Issue] #440 and #3276 (note that this last issue also asks for more fine grained control over endpoint removal. The solution this PR provides was brought up as a partial solution to #3276). Signed-off-by: Snow Pettersen <[email protected]>
1 parent 9851e3c commit 08712e9

File tree

9 files changed

+151
-9
lines changed

9 files changed

+151
-9
lines changed

api/envoy/api/v2/cds.proto

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,14 @@ message Cluster {
442442
// to an upstream host that becomes unhealthy, Envoy may spend a substantial amount of
443443
// time exclusively closing these connections, and not processing any other traffic.
444444
bool close_connections_on_host_health_failure = 31;
445+
446+
// If this cluster uses EDS or STRICT_DNS to configure its hosts, immediately drain
447+
// connections from any hosts that are removed from service discovery.
448+
//
449+
// This only affects behavior for hosts that are being actively health checked.
450+
// If this flag is not set to true, Envoy will wait until the hosts fail active health
451+
// checking before removing it from the cluster.
452+
bool drain_connections_on_host_removal = 32;
445453
}
446454

447455
// An extensible structure containing the address Envoy should bind to when

docs/root/intro/version_history.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ Version history
2424
* cli: added --config-yaml flag to the Envoy binary. When set its value is interpreted as a yaml
2525
representation of the bootstrap config and overrides --config-path.
2626
* cluster: Add :ref:`option <envoy_api_field_Cluster.close_connections_on_host_health_failure>`
27-
to close tcp_proxy upstream connections when health checks fail.
27+
* cluster: Add :ref:`option <envoy_api_field_Cluster.drain_connections_on_host_removal>` to drain
28+
connections from hosts after they are removed from service discovery, regardless of health status.
2829
* health check: added ability to set :ref:`additional HTTP headers
2930
<envoy_api_field_core.HealthCheck.HttpHealthCheck.request_headers_to_add>` for HTTP health check.
3031
* health check: added support for EDS delivered :ref:`endpoint health status

include/envoy/upstream/upstream.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,12 @@ class ClusterInfo {
528528
* connections for this cluster.
529529
*/
530530
virtual const Network::ConnectionSocket::OptionsSharedPtr& clusterSocketOptions() const PURE;
531+
532+
/**
533+
* @return whether to skip waiting for health checking before draining connections
534+
* after a host is removed from service discovery.
535+
*/
536+
virtual bool drainConnectionsOnHostRemoval() const PURE;
531537
};
532538

533539
typedef std::shared_ptr<const ClusterInfo> ClusterInfoConstSharedPtr;

source/common/upstream/eds.cc

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,7 @@ bool EdsClusterImpl::updateHostsPerLocality(HostSet& host_set, const HostVector&
129129
// out of the locality scheduler, we discover their new weights. We don't currently have a shared
130130
// object for locality weights that we can update here, we should add something like this to
131131
// improve performance and scalability of locality weight updates.
132-
if (updateDynamicHostList(new_hosts, *current_hosts_copy, hosts_added, hosts_removed,
133-
health_checker_ != nullptr) ||
132+
if (updateDynamicHostList(new_hosts, *current_hosts_copy, hosts_added, hosts_removed) ||
134133
locality_weights_map != new_locality_weights_map) {
135134
locality_weights_map = new_locality_weights_map;
136135
LocalityWeightsSharedPtr locality_weights;

source/common/upstream/upstream_impl.cc

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,8 @@ ClusterInfoImpl::ClusterInfoImpl(const envoy::api::v2::Cluster& config,
269269
ssl_context_manager_(ssl_context_manager), added_via_api_(added_via_api),
270270
lb_subset_(LoadBalancerSubsetInfoImpl(config.lb_subset_config())),
271271
metadata_(config.metadata()), common_lb_config_(config.common_lb_config()),
272-
cluster_socket_options_(parseClusterSocketOptions(config, bind_config)) {
272+
cluster_socket_options_(parseClusterSocketOptions(config, bind_config)),
273+
drain_connections_on_host_removal_(config.drain_connections_on_host_removal()) {
273274

274275
// If the cluster doesn't have a transport socket configured, override with the default transport
275276
// socket implementation based on the tls_context. We copy by value first then override if
@@ -659,7 +660,7 @@ void StaticClusterImpl::startPreInit() {
659660
bool BaseDynamicClusterImpl::updateDynamicHostList(const HostVector& new_hosts,
660661
HostVector& current_hosts,
661662
HostVector& hosts_added,
662-
HostVector& hosts_removed, bool depend_on_hc) {
663+
HostVector& hosts_removed) {
663664
uint64_t max_host_weight = 1;
664665
// Has the EDS health status changed the health of any endpoint? If so, we
665666
// rebuild the hosts vectors. We only do this if the health status of an
@@ -729,14 +730,16 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const HostVector& new_hosts,
729730
hosts_added.push_back(host);
730731

731732
// If we are depending on a health checker, we initialize to unhealthy.
732-
if (depend_on_hc) {
733+
if (health_checker_ != nullptr) {
733734
hosts_added.back()->healthFlagSet(Host::HealthFlag::FAILED_ACTIVE_HC);
734735
}
735736
}
736737
}
737738

739+
const bool dont_remove_healthy_hosts =
740+
health_checker_ != nullptr && !info()->drainConnectionsOnHostRemoval();
738741
// If there are removed hosts, check to see if we should only delete if unhealthy.
739-
if (!current_hosts.empty() && depend_on_hc) {
742+
if (!current_hosts.empty() && dont_remove_healthy_hosts) {
740743
for (auto i = current_hosts.begin(); i != current_hosts.end();) {
741744
if (!(*i)->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC)) {
742745
if ((*i)->weight() > max_host_weight) {
@@ -865,7 +868,7 @@ void StrictDnsClusterImpl::ResolveTarget::startResolve() {
865868

866869
HostVector hosts_added;
867870
HostVector hosts_removed;
868-
if (parent_.updateDynamicHostList(new_hosts, hosts_, hosts_added, hosts_removed, false)) {
871+
if (parent_.updateDynamicHostList(new_hosts, hosts_, hosts_added, hosts_removed)) {
869872
ENVOY_LOG(debug, "DNS hosts have changed for {}", dns_address_);
870873
parent_.updateAllHosts(hosts_added, hosts_removed);
871874
}

source/common/upstream/upstream_impl.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,8 @@ class ClusterInfoImpl : public ClusterInfo,
360360
return cluster_socket_options_;
361361
};
362362

363+
bool drainConnectionsOnHostRemoval() const override { return drain_connections_on_host_removal_; }
364+
363365
private:
364366
struct ResourceManagers {
365367
ResourceManagers(const envoy::api::v2::Cluster& config, Runtime::Loader& runtime,
@@ -398,6 +400,7 @@ class ClusterInfoImpl : public ClusterInfo,
398400
const envoy::api::v2::core::Metadata metadata_;
399401
const envoy::api::v2::Cluster::CommonLbConfig common_lb_config_;
400402
const Network::ConnectionSocket::OptionsSharedPtr cluster_socket_options_;
403+
const bool drain_connections_on_host_removal_;
401404
};
402405

403406
/**
@@ -515,7 +518,7 @@ class BaseDynamicClusterImpl : public ClusterImplBase {
515518
using ClusterImplBase::ClusterImplBase;
516519

517520
bool updateDynamicHostList(const HostVector& new_hosts, HostVector& current_hosts,
518-
HostVector& hosts_added, HostVector& hosts_removed, bool depend_on_hc);
521+
HostVector& hosts_added, HostVector& hosts_removed);
519522
};
520523

521524
/**

test/common/upstream/eds_test.cc

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#include <memory>
2+
13
#include "envoy/api/v2/eds.pb.h"
24

35
#include "common/config/utility.h"
@@ -15,6 +17,7 @@
1517

1618
using testing::Return;
1719
using testing::ReturnRef;
20+
using testing::_;
1821

1922
namespace Envoy {
2023
namespace Upstream {
@@ -302,6 +305,72 @@ TEST_F(EdsTest, EndpointHealthStatus) {
302305
}
303306
}
304307

308+
// Validate that onConfigUpdate() removes endpoints that are marked as healthy
309+
// when configured to do so.
310+
TEST_F(EdsTest, EndpointRemoval) {
311+
resetCluster(R"EOF(
312+
name: name
313+
connect_timeout: 0.25s
314+
type: EDS
315+
lb_policy: ROUND_ROBIN
316+
drain_connections_on_host_removal: true
317+
eds_cluster_config:
318+
service_name: fare
319+
eds_config:
320+
api_config_source:
321+
cluster_names:
322+
- eds
323+
refresh_delay: 1s
324+
)EOF");
325+
326+
auto health_checker = std::make_shared<MockHealthChecker>();
327+
EXPECT_CALL(*health_checker, start());
328+
EXPECT_CALL(*health_checker, addHostCheckCompleteCb(_)).Times(2);
329+
cluster_->setHealthChecker(health_checker);
330+
331+
Protobuf::RepeatedPtrField<envoy::api::v2::ClusterLoadAssignment> resources;
332+
auto* cluster_load_assignment = resources.Add();
333+
cluster_load_assignment->set_cluster_name("fare");
334+
335+
auto add_endpoint = [cluster_load_assignment](int port) {
336+
auto* endpoints = cluster_load_assignment->add_endpoints();
337+
338+
auto* socket_address = endpoints->add_lb_endpoints()
339+
->mutable_endpoint()
340+
->mutable_address()
341+
->mutable_socket_address();
342+
socket_address->set_address("1.2.3.4");
343+
socket_address->set_port_value(port);
344+
};
345+
346+
add_endpoint(80);
347+
add_endpoint(81);
348+
349+
VERBOSE_EXPECT_NO_THROW(cluster_->onConfigUpdate(resources));
350+
351+
{
352+
auto& hosts = cluster_->prioritySet().hostSetsPerPriority()[0]->hosts();
353+
EXPECT_EQ(hosts.size(), 2);
354+
355+
EXPECT_TRUE(hosts[0]->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC));
356+
EXPECT_TRUE(hosts[1]->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC));
357+
// Mark the hosts as healthy
358+
hosts[0]->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC);
359+
hosts[1]->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC);
360+
}
361+
362+
// Remove endpoints and add back the port 80 one
363+
cluster_load_assignment->clear_endpoints();
364+
add_endpoint(80);
365+
366+
VERBOSE_EXPECT_NO_THROW(cluster_->onConfigUpdate(resources));
367+
368+
{
369+
auto& hosts = cluster_->prioritySet().hostSetsPerPriority()[0]->hosts();
370+
EXPECT_EQ(hosts.size(), 1);
371+
}
372+
}
373+
305374
// Validate that onConfigUpdate() updates the endpoint locality.
306375
TEST_F(EdsTest, EndpointLocality) {
307376
Protobuf::RepeatedPtrField<envoy::api::v2::ClusterLoadAssignment> resources;

test/common/upstream/upstream_impl_test.cc

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,58 @@ TEST(StrictDnsClusterImplTest, Basic) {
307307
EXPECT_CALL(resolver2.active_dns_query_, cancel());
308308
}
309309

310+
// Verifies that host removal works correctly when hosts are being health checked
311+
// but the cluster is configured to always remove hosts
312+
TEST(StrictDnsClusterImplTest, HostRemovalActiveHealthSkipped) {
313+
Stats::IsolatedStoreImpl stats;
314+
Ssl::MockContextManager ssl_context_manager;
315+
auto dns_resolver = std::make_shared<Network::MockDnsResolver>();
316+
NiceMock<Event::MockDispatcher> dispatcher;
317+
NiceMock<Runtime::MockLoader> runtime;
318+
NiceMock<MockClusterManager> cm;
319+
320+
const std::string yaml = R"EOF(
321+
name: name
322+
connect_timeout: 0.25s
323+
type: STRICT_DNS
324+
lb_policy: ROUND_ROBIN
325+
drain_connections_on_host_removal: true
326+
hosts: [{ socket_address: { address: foo.bar.com, port_value: 443 }}]
327+
)EOF";
328+
329+
ResolverData resolver(*dns_resolver, dispatcher);
330+
StrictDnsClusterImpl cluster(parseClusterFromV2Yaml(yaml), runtime, stats, ssl_context_manager,
331+
dns_resolver, cm, dispatcher, false);
332+
std::shared_ptr<MockHealthChecker> health_checker(new MockHealthChecker());
333+
EXPECT_CALL(*health_checker, start());
334+
EXPECT_CALL(*health_checker, addHostCheckCompleteCb(_));
335+
cluster.setHealthChecker(health_checker);
336+
cluster.initialize([&]() -> void {});
337+
338+
EXPECT_CALL(*health_checker, addHostCheckCompleteCb(_));
339+
EXPECT_CALL(*resolver.timer_, enableTimer(_)).Times(2);
340+
resolver.dns_callback_(TestUtility::makeDnsResponse({"127.0.0.1", "127.0.0.2"}));
341+
342+
// Verify that both endpoints are initially marked with FAILED_ACTIVE_HC, then
343+
// clear the flag to simulate that these endpoints have been sucessfully health
344+
// checked.
345+
{
346+
const auto& hosts = cluster.prioritySet().hostSetsPerPriority()[0]->hosts();
347+
EXPECT_EQ(2UL, hosts.size());
348+
349+
for (size_t i = 0; i < hosts.size(); ++i) {
350+
EXPECT_TRUE(hosts[i]->healthFlagGet(Host::HealthFlag::FAILED_ACTIVE_HC));
351+
hosts[i]->healthFlagClear(Host::HealthFlag::FAILED_ACTIVE_HC);
352+
}
353+
}
354+
355+
// Re-resolve the DNS name with only one record
356+
resolver.dns_callback_(TestUtility::makeDnsResponse({"127.0.0.1"}));
357+
358+
const auto& hosts = cluster.prioritySet().hostSetsPerPriority()[0]->hosts();
359+
EXPECT_EQ(1UL, hosts.size());
360+
}
361+
310362
TEST(HostImplTest, HostCluster) {
311363
MockCluster cluster;
312364
HostSharedPtr host = makeTestHost(cluster.info_, "tcp://10.0.0.1:1234", 1);

test/mocks/upstream/cluster_info.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ class MockClusterInfo : public ClusterInfo {
6363
MOCK_CONST_METHOD0(lbSubsetInfo, const LoadBalancerSubsetInfo&());
6464
MOCK_CONST_METHOD0(metadata, const envoy::api::v2::core::Metadata&());
6565
MOCK_CONST_METHOD0(clusterSocketOptions, const Network::ConnectionSocket::OptionsSharedPtr&());
66+
MOCK_CONST_METHOD0(drainConnectionsOnHostRemoval, bool());
6667

6768
std::string name_{"fake_cluster"};
6869
Http::Http2Settings http2_settings_{};

0 commit comments

Comments
 (0)