Skip to content

Commit 45fe83e

Browse files
authored
http: per-stream idle timeout. (#3841)
Fixes #1778. Risk level: Medium. A very conservative 5 minute default idle timeout has been set, which should not affect most deployments with default timeout already kicking in for connection idle or upstream idle. This will however affect things like hanging POSTs. Testing: Integration and unit tests added for various timeout scenarios. Signed-off-by: Harvey Tuch <[email protected]>
1 parent e435d18 commit 45fe83e

File tree

15 files changed

+527
-4
lines changed

15 files changed

+527
-4
lines changed

api/envoy/api/v2/route/route.proto

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ message CorsPolicy {
313313
google.protobuf.BoolValue enabled = 7;
314314
}
315315

316-
// [#comment:next free field: 24]
316+
// [#comment:next free field: 25]
317317
message RouteAction {
318318
oneof cluster_specifier {
319319
option (validate.required) = true;
@@ -401,7 +401,9 @@ message RouteAction {
401401
google.protobuf.BoolValue auto_host_rewrite = 7;
402402
}
403403

404-
// Specifies the timeout for the route. If not specified, the default is 15s.
404+
// Specifies the upstream timeout for the route. If not specified, the default is 15s. This
405+
// spans between the point at which the entire downstream request (i.e. end-of-stream) has been
406+
// processed and when the upstream response has been completely processed.
405407
//
406408
// .. note::
407409
//
@@ -423,8 +425,8 @@ message RouteAction {
423425
// :ref:`config_http_filters_router_x-envoy-max-retries`.
424426
google.protobuf.UInt32Value num_retries = 2;
425427

426-
// Specifies a non-zero timeout per retry attempt. This parameter is optional.
427-
// The same conditions documented for
428+
// Specifies a non-zero upstream timeout per retry attempt. This parameter is optional. The
429+
// same conditions documented for
428430
// :ref:`config_http_filters_router_x-envoy-upstream-rq-per-try-timeout-ms` apply.
429431
//
430432
// .. note::
@@ -437,6 +439,28 @@ message RouteAction {
437439
google.protobuf.Duration per_try_timeout = 3 [(gogoproto.stdduration) = true];
438440
}
439441

442+
// Specifies the idle timeout for the route. If not specified, this defaults
443+
// to 5 minutes. The default value was select so as not to interfere with any
444+
// smaller configured timeouts that may have existed in configurations prior
445+
// to the introduction of this feature, while introducing robustness to TCP
446+
// connections that terminate without FIN. A value of 0 will completely
447+
// disable the idle timeout.
448+
//
449+
// The idle timeout is distinct to :ref:`timeout
450+
// <envoy_api_field_route.RouteAction.timeout>`, which provides an upper bound
451+
// on the upstream response time; :ref:`idle_timeout
452+
// <envoy_api_field_route.RouteAction.idle_timeout>` instead bounds the amount
453+
// of time the request's stream may be idle.
454+
//
455+
// After header decoding, the idle timeout will apply on downstream and
456+
// upstream request events. Each time an encode/decode event for headers or
457+
// data is processed for the stream, the timer will be reset. If the timeout
458+
// fires, the stream is terminated with a 408 Request Timeout error code if no
459+
// upstream response header has been received, otherwise a stream reset
460+
// occurs.
461+
google.protobuf.Duration idle_timeout = 24
462+
[(validate.rules).duration.gt = {}, (gogoproto.stdduration) = true];
463+
440464
// Indicates that the route has a retry policy.
441465
RetryPolicy retry_policy = 9;
442466

docs/root/configuration/http_conn_man/stats.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ statistics:
5252
downstream_rq_5xx, Counter, Total 5xx responses
5353
downstream_rq_ws_on_non_ws_route, Counter, Total WebSocket upgrade requests rejected by non WebSocket routes
5454
downstream_rq_time, Histogram, Request time milliseconds
55+
downstream_rq_idle_timeout, Counter, Total requests closed due to idle timeout
5556
rs_too_large, Counter, Total response errors due to buffering an overly large body
5657

5758
Per user agent statistics

docs/root/intro/version_history.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@ Version history
1414
* health check: added support for :ref:`custom health check <envoy_api_field_core.HealthCheck.custom_health_check>`.
1515
* health check: added support for :ref:`specifying jitter as a percentage <envoy_api_field_core.HealthCheck.interval_jitter_percent>`.
1616
* health_check: added support for :ref:`health check event logging <arch_overview_health_check_logging>`.
17+
* http: added support for a :ref:`per-stream idle timeout
18+
<envoy_api_field_route.RouteAction.idle_timeout>`. This defaults to 5 minutes; if you have
19+
other timeouts (e.g. connection idle timeout, upstream response per-retry) that are longer than
20+
this in duration, you may want to consider setting a non-default per-stream idle timeout.
1721
* http: better handling of HEAD requests. Now sending transfer-encoding: chunked rather than content-length: 0.
1822
* http: response filters not applied to early error paths such as http_parser generated 400s.
1923
* proxy_protocol: added support for HAProxy Proxy Protocol v2 (AF_INET/AF_INET6 only).

include/envoy/router/router.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,11 @@ class RouteEntry : public ResponseEntry {
472472
*/
473473
virtual std::chrono::milliseconds timeout() const PURE;
474474

475+
/**
476+
* @return absl::optional<std::chrono::milliseconds> the route's idle timeout.
477+
*/
478+
virtual absl::optional<std::chrono::milliseconds> idleTimeout() const PURE;
479+
475480
/**
476481
* @return absl::optional<std::chrono::milliseconds> the maximum allowed timeout value derived
477482
* from 'grpc-timeout' header of a gRPC request. Non-present value disables use of 'grpc-timeout'

source/common/http/async_client_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ class AsyncStreamImpl : public AsyncClient::Stream,
195195
return std::chrono::milliseconds(0);
196196
}
197197
}
198+
absl::optional<std::chrono::milliseconds> idleTimeout() const override { return absl::nullopt; }
198199
absl::optional<std::chrono::milliseconds> maxGrpcTimeout() const override {
199200
return absl::nullopt;
200201
}

source/common/http/conn_manager_config.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ namespace Http {
5454
COUNTER (downstream_rq_4xx) \
5555
COUNTER (downstream_rq_5xx) \
5656
HISTOGRAM(downstream_rq_time) \
57+
COUNTER (downstream_rq_idle_timeout) \
5758
COUNTER (rs_too_large)
5859
// clang-format on
5960

source/common/http/conn_manager_impl.cc

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,28 @@ ConnectionManagerImpl::ActiveStream::~ActiveStream() {
396396
ASSERT(state_.filter_call_state_ == 0);
397397
}
398398

399+
void ConnectionManagerImpl::ActiveStream::resetIdleTimer() {
400+
if (idle_timer_ != nullptr) {
401+
// TODO(htuch): If this shows up in performance profiles, optimize by only
402+
// updating a timestamp here and doing periodic checks for idle timeouts
403+
// instead, or reducing the accuracy of timers.
404+
idle_timer_->enableTimer(idle_timeout_ms_);
405+
}
406+
}
407+
408+
void ConnectionManagerImpl::ActiveStream::onIdleTimeout() {
409+
connection_manager_.stats_.named_.downstream_rq_idle_timeout_.inc();
410+
// If headers have not been sent to the user, send a 408.
411+
if (response_headers_ != nullptr) {
412+
// TODO(htuch): We could send trailers here with an x-envoy timeout header
413+
// or gRPC status code, and/or set H2 RST_STREAM error.
414+
connection_manager_.doEndStream(*this);
415+
} else {
416+
sendLocalReply(Grpc::Common::hasGrpcContentType(*request_headers_), Http::Code::RequestTimeout,
417+
"stream timeout", nullptr);
418+
}
419+
}
420+
399421
void ConnectionManagerImpl::ActiveStream::addStreamDecoderFilterWorker(
400422
StreamDecoderFilterSharedPtr filter, bool dual_filter) {
401423
ActiveStreamDecoderFilterPtr wrapper(new ActiveStreamDecoderFilter(*this, filter, dual_filter));
@@ -579,6 +601,16 @@ void ConnectionManagerImpl::ActiveStream::decodeHeaders(HeaderMapPtr&& headers,
579601
// Allow non websocket requests to go through websocket enabled routes.
580602
}
581603

604+
if (cached_route_.value()) {
605+
const Router::RouteEntry* route_entry = cached_route_.value()->routeEntry();
606+
if (route_entry != nullptr && route_entry->idleTimeout()) {
607+
idle_timeout_ms_ = route_entry->idleTimeout().value();
608+
idle_timer_ = connection_manager_.read_callbacks_->connection().dispatcher().createTimer(
609+
[this]() -> void { onIdleTimeout(); });
610+
resetIdleTimer();
611+
}
612+
}
613+
582614
// Check if tracing is enabled at all.
583615
if (connection_manager_.config_.tracingConfig()) {
584616
traceRequest();
@@ -702,6 +734,8 @@ void ConnectionManagerImpl::ActiveStream::decodeData(Buffer::Instance& data, boo
702734

703735
void ConnectionManagerImpl::ActiveStream::decodeData(ActiveStreamDecoderFilter* filter,
704736
Buffer::Instance& data, bool end_stream) {
737+
resetIdleTimer();
738+
705739
// If a response is complete or a reset has been sent, filters do not care about further body
706740
// data. Just drop it.
707741
if (state_.local_complete_) {
@@ -750,6 +784,7 @@ void ConnectionManagerImpl::ActiveStream::addDecodedData(ActiveStreamDecoderFilt
750784
}
751785

752786
void ConnectionManagerImpl::ActiveStream::decodeTrailers(HeaderMapPtr&& trailers) {
787+
resetIdleTimer();
753788
maybeEndDecode(true);
754789
request_trailers_ = std::move(trailers);
755790
decodeTrailers(nullptr, *request_trailers_);
@@ -846,6 +881,7 @@ void ConnectionManagerImpl::ActiveStream::sendLocalReply(
846881

847882
void ConnectionManagerImpl::ActiveStream::encode100ContinueHeaders(
848883
ActiveStreamEncoderFilter* filter, HeaderMap& headers) {
884+
resetIdleTimer();
849885
ASSERT(connection_manager_.config_.proxy100Continue());
850886
// Make sure commonContinue continues encode100ContinueHeaders.
851887
has_continue_headers_ = true;
@@ -882,6 +918,8 @@ void ConnectionManagerImpl::ActiveStream::encode100ContinueHeaders(
882918

883919
void ConnectionManagerImpl::ActiveStream::encodeHeaders(ActiveStreamEncoderFilter* filter,
884920
HeaderMap& headers, bool end_stream) {
921+
resetIdleTimer();
922+
885923
std::list<ActiveStreamEncoderFilterPtr>::iterator entry = commonEncodePrefix(filter, end_stream);
886924
std::list<ActiveStreamEncoderFilterPtr>::iterator continue_data_entry = encoder_filters_.end();
887925

@@ -1019,6 +1057,7 @@ void ConnectionManagerImpl::ActiveStream::addEncodedData(ActiveStreamEncoderFilt
10191057

10201058
void ConnectionManagerImpl::ActiveStream::encodeData(ActiveStreamEncoderFilter* filter,
10211059
Buffer::Instance& data, bool end_stream) {
1060+
resetIdleTimer();
10221061
std::list<ActiveStreamEncoderFilterPtr>::iterator entry = commonEncodePrefix(filter, end_stream);
10231062
for (; entry != encoder_filters_.end(); entry++) {
10241063
ASSERT(!(state_.filter_call_state_ & FilterCallState::EncodeData));
@@ -1042,6 +1081,7 @@ void ConnectionManagerImpl::ActiveStream::encodeData(ActiveStreamEncoderFilter*
10421081

10431082
void ConnectionManagerImpl::ActiveStream::encodeTrailers(ActiveStreamEncoderFilter* filter,
10441083
HeaderMap& trailers) {
1084+
resetIdleTimer();
10451085
std::list<ActiveStreamEncoderFilterPtr>::iterator entry = commonEncodePrefix(filter, true);
10461086
for (; entry != encoder_filters_.end(); entry++) {
10471087
ASSERT(!(state_.filter_call_state_ & FilterCallState::EncodeTrailers));

source/common/http/conn_manager_impl.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,10 @@ class ConnectionManagerImpl : Logger::Loggable<Logger::Id::http>,
362362
void setBufferLimit(uint32_t limit);
363363
// Set up the Encoder/Decoder filter chain.
364364
bool createFilterChain();
365+
// Per-stream idle timeout callback.
366+
void onIdleTimeout();
367+
// Reset per-stream idle timer.
368+
void resetIdleTimer();
365369

366370
ConnectionManagerImpl& connection_manager_;
367371
Router::ConfigConstSharedPtr snapped_route_config_;
@@ -379,6 +383,9 @@ class ConnectionManagerImpl : Logger::Loggable<Logger::Id::http>,
379383
std::list<ActiveStreamEncoderFilterPtr> encoder_filters_;
380384
std::list<AccessLog::InstanceSharedPtr> access_log_handlers_;
381385
Stats::TimespanPtr request_timer_;
386+
// Per-stream idle timeout.
387+
Event::TimerPtr idle_timer_;
388+
std::chrono::milliseconds idle_timeout_ms_{};
382389
State state_;
383390
RequestInfo::RequestInfoImpl request_info_;
384391
absl::optional<Router::RouteConstSharedPtr> cached_route_;

source/common/router/config_impl.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,8 @@ RouteEntryImplBase::RouteEntryImplBase(const VirtualHostImpl& vhost,
258258
cluster_not_found_response_code_(ConfigUtility::parseClusterNotFoundResponseCode(
259259
route.route().cluster_not_found_response_code())),
260260
timeout_(PROTOBUF_GET_MS_OR_DEFAULT(route.route(), timeout, DEFAULT_ROUTE_TIMEOUT_MS)),
261+
idle_timeout_(
262+
PROTOBUF_GET_MS_OR_DEFAULT(route.route(), idle_timeout, DEFAULT_ROUTE_IDLE_TIMEOUT_MS)),
261263
max_grpc_timeout_(PROTOBUF_GET_OPTIONAL_MS(route.route(), max_grpc_timeout)),
262264
runtime_(loadRuntimeData(route.match())), loader_(factory_context.runtime()),
263265
host_redirect_(route.redirect().host_redirect()),

source/common/router/config_impl.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,10 @@ class RouteEntryImplBase : public RouteEntry,
311311
return vhost_.virtualClusterFromEntries(headers);
312312
}
313313
std::chrono::milliseconds timeout() const override { return timeout_; }
314+
absl::optional<std::chrono::milliseconds> idleTimeout() const override {
315+
return idle_timeout_.count() == 0 ? absl::nullopt
316+
: absl::optional<std::chrono::milliseconds>(idle_timeout_);
317+
}
314318
absl::optional<std::chrono::milliseconds> maxGrpcTimeout() const override {
315319
return max_grpc_timeout_;
316320
}
@@ -395,6 +399,9 @@ class RouteEntryImplBase : public RouteEntry,
395399
const RetryPolicy& retryPolicy() const override { return parent_->retryPolicy(); }
396400
const ShadowPolicy& shadowPolicy() const override { return parent_->shadowPolicy(); }
397401
std::chrono::milliseconds timeout() const override { return parent_->timeout(); }
402+
absl::optional<std::chrono::milliseconds> idleTimeout() const override {
403+
return parent_->idleTimeout();
404+
}
398405
absl::optional<std::chrono::milliseconds> maxGrpcTimeout() const override {
399406
return parent_->maxGrpcTimeout();
400407
}
@@ -503,6 +510,9 @@ class RouteEntryImplBase : public RouteEntry,
503510
// Default timeout is 15s if nothing is specified in the route config.
504511
static const uint64_t DEFAULT_ROUTE_TIMEOUT_MS = 15000;
505512

513+
// Default idle timeout is 5 minutes if nothing is specified in the route config.
514+
static const uint64_t DEFAULT_ROUTE_IDLE_TIMEOUT_MS = 5 * 60 * 1000;
515+
506516
std::unique_ptr<const CorsPolicyImpl> cors_policy_;
507517
const VirtualHostImpl& vhost_; // See note in RouteEntryImplBase::clusterEntry() on why raw ref
508518
// to virtual host is currently safe.
@@ -512,6 +522,7 @@ class RouteEntryImplBase : public RouteEntry,
512522
const Http::LowerCaseString cluster_header_name_;
513523
const Http::Code cluster_not_found_response_code_;
514524
const std::chrono::milliseconds timeout_;
525+
const std::chrono::milliseconds idle_timeout_;
515526
const absl::optional<std::chrono::milliseconds> max_grpc_timeout_;
516527
const absl::optional<RuntimeData> runtime_;
517528
Runtime::Loader& loader_;

0 commit comments

Comments
 (0)