Skip to content

Commit 2216d3c

Browse files
committed
Add health start interval
This adds an additional interval to be used by healthchecks during the start period. Typically when a container is just starting you want to check if it is ready more quickly than a typical healthcheck might run. Without this, users have to balance between running healthchecks too frequently vs taking a very long time to mark a container as healthy for the first time. Signed-off-by: Brian Goff <[email protected]> Signed-off-by: Sebastiaan van Stijn <[email protected]>
1 parent 11c6ec6 commit 2216d3c

8 files changed

Lines changed: 112 additions & 6 deletions

File tree

api/server/router/container/container_routes.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,14 @@ func (s *containerRouter) postContainersCreate(ctx context.Context, w http.Respo
541541
bo.CreateMountpoint = false
542542
}
543543
}
544+
545+
}
546+
547+
if hostConfig != nil && versions.LessThan(version, "1.44") {
548+
if config.Healthcheck != nil {
549+
// StartInterval was added in API 1.44
550+
config.Healthcheck.StartInterval = 0
551+
}
544552
}
545553

546554
if hostConfig != nil && versions.GreaterThanOrEqualTo(version, "1.42") {

api/swagger.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -804,6 +804,12 @@ definitions:
804804
1000000 (1 ms). 0 means inherit.
805805
type: "integer"
806806
format: "int64"
807+
StartInterval:
808+
description: |
809+
The time to wait between checks in nanoseconds during the start period.
810+
It should be 0 or at least 1000000 (1 ms). 0 means inherit.
811+
type: "integer"
812+
format: "int64"
807813

808814
Health:
809815
description: |

api/types/container/config.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,10 @@ type HealthConfig struct {
4444
Test []string `json:",omitempty"`
4545

4646
// Zero means to inherit. Durations are expressed as integer nanoseconds.
47-
Interval time.Duration `json:",omitempty"` // Interval is the time to wait between checks.
48-
Timeout time.Duration `json:",omitempty"` // Timeout is the time to wait before considering the check to have hung.
49-
StartPeriod time.Duration `json:",omitempty"` // The start period for the container to initialize before the retries starts to count down.
47+
Interval time.Duration `json:",omitempty"` // Interval is the time to wait between checks.
48+
Timeout time.Duration `json:",omitempty"` // Timeout is the time to wait before considering the check to have hung.
49+
StartPeriod time.Duration `json:",omitempty"` // The start period for the container to initialize before the retries starts to count down.
50+
StartInterval time.Duration `json:",omitempty"` // The interval to attempt healthchecks at during the start period
5051

5152
// Retries is the number of consecutive failures needed to consider a container as unhealthy.
5253
// Zero means inherit.

client/container_create.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ func (cli *Client) ContainerCreate(ctx context.Context, config *container.Config
2929
if err := cli.NewVersionError("1.41", "specify container image platform"); platform != nil && err != nil {
3030
return response, err
3131
}
32+
if err := cli.NewVersionError("1.44", "specify health-check start interval"); config != nil && config.Healthcheck != nil && config.Healthcheck.StartInterval != 0 && err != nil {
33+
return response, err
34+
}
3235

3336
if hostConfig != nil {
3437
if versions.LessThan(cli.ClientVersion(), "1.25") {

daemon/commit.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,9 @@ func merge(userConf, imageConf *containertypes.Config) error {
9292
if userConf.Healthcheck.StartPeriod == 0 {
9393
userConf.Healthcheck.StartPeriod = imageConf.Healthcheck.StartPeriod
9494
}
95+
if userConf.Healthcheck.StartInterval == 0 {
96+
userConf.Healthcheck.StartInterval = imageConf.Healthcheck.StartInterval
97+
}
9598
if userConf.Healthcheck.Retries == 0 {
9699
userConf.Healthcheck.Retries = imageConf.Healthcheck.Retries
97100
}

daemon/health.go

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -248,13 +248,31 @@ func handleProbeResult(d *Daemon, c *container.Container, result *types.Healthch
248248
// There is never more than one monitor thread running per container at a time.
249249
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
250250
probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
251+
startInterval := timeoutWithDefault(c.Config.Healthcheck.StartInterval, defaultProbeInterval)
252+
startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
251253

252-
intervalTimer := time.NewTimer(probeInterval)
254+
c.Lock()
255+
started := c.State.StartedAt
256+
c.Unlock()
257+
258+
getInterval := func() time.Duration {
259+
if time.Since(started) >= startPeriod {
260+
return probeInterval
261+
}
262+
c.Lock()
263+
status := c.Health.Health.Status
264+
c.Unlock()
265+
266+
if status == types.Starting {
267+
return startInterval
268+
}
269+
return probeInterval
270+
}
271+
272+
intervalTimer := time.NewTimer(getInterval())
253273
defer intervalTimer.Stop()
254274

255275
for {
256-
intervalTimer.Reset(probeInterval)
257-
258276
select {
259277
case <-stop:
260278
log.G(context.TODO()).Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
@@ -296,6 +314,7 @@ func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe)
296314
cancelProbe()
297315
}
298316
}
317+
intervalTimer.Reset(getInterval())
299318
}
300319
}
301320

docs/api/version-history.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ keywords: "API, Docker, rcli, REST, documentation"
2424
with runtimes which support the feature.
2525
`POST /containers/create`, `GET /containers/{id}/json`, and `GET /containers/json` now support
2626
`BindOptions.ReadOnlyNonRecursive` and `BindOptions.ReadOnlyForceRecursive` to customize the behavior.
27+
* `POST /containers/create` now accepts a `HealthConfig.StartInterval` to set the
28+
interval for health checks during the start period.
2729

2830
## v1.43 API changes
2931

integration/container/health_test.go

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,70 @@ func TestHealthCheckProcessKilled(t *testing.T) {
111111
poll.WaitOn(t, pollForHealthCheckLog(ctx, apiClient, cID, "Health check exceeded timeout (50ms): logs logs logs\n"))
112112
}
113113

114+
func TestHealthStartInterval(t *testing.T) {
115+
skip.If(t, testEnv.DaemonInfo.OSType == "windows", "The shell commands used in the test healthcheck do not work on Windows")
116+
defer setupTest(t)()
117+
ctx := context.Background()
118+
client := testEnv.APIClient()
119+
120+
// Note: Windows is much slower than linux so this uses longer intervals/timeouts
121+
id := container.Run(ctx, t, client, func(c *container.TestContainerConfig) {
122+
c.Config.Healthcheck = &containertypes.HealthConfig{
123+
Test: []string{"CMD-SHELL", `count="$(cat /tmp/health)"; if [ -z "${count}" ]; then let count=0; fi; let count=${count}+1; echo -n ${count} | tee /tmp/health; if [ ${count} -lt 3 ]; then exit 1; fi`},
124+
Interval: 30 * time.Second,
125+
StartInterval: time.Second,
126+
StartPeriod: 30 * time.Second,
127+
}
128+
})
129+
130+
ctxPoll, cancel := context.WithTimeout(ctx, 30*time.Second)
131+
defer cancel()
132+
133+
dl, _ := ctxPoll.Deadline()
134+
135+
poll.WaitOn(t, func(log poll.LogT) poll.Result {
136+
if ctxPoll.Err() != nil {
137+
return poll.Error(ctxPoll.Err())
138+
}
139+
inspect, err := client.ContainerInspect(ctxPoll, id)
140+
if err != nil {
141+
return poll.Error(err)
142+
}
143+
if inspect.State.Health.Status != "healthy" {
144+
if len(inspect.State.Health.Log) > 0 {
145+
t.Log(inspect.State.Health.Log[len(inspect.State.Health.Log)-1])
146+
}
147+
return poll.Continue("waiting on container to be ready")
148+
}
149+
return poll.Success()
150+
}, poll.WithDelay(100*time.Millisecond), poll.WithTimeout(time.Until(dl)))
151+
cancel()
152+
153+
ctxPoll, cancel = context.WithTimeout(ctx, 2*time.Minute)
154+
defer cancel()
155+
dl, _ = ctxPoll.Deadline()
156+
157+
poll.WaitOn(t, func(log poll.LogT) poll.Result {
158+
inspect, err := client.ContainerInspect(ctxPoll, id)
159+
if err != nil {
160+
return poll.Error(err)
161+
}
162+
163+
hLen := len(inspect.State.Health.Log)
164+
if hLen < 2 {
165+
return poll.Continue("waiting for more healthcheck results")
166+
}
167+
168+
h1 := inspect.State.Health.Log[hLen-1]
169+
h2 := inspect.State.Health.Log[hLen-2]
170+
if h1.Start.Sub(h2.Start) >= inspect.Config.Healthcheck.Interval {
171+
return poll.Success()
172+
}
173+
t.Log(h1.Start.Sub(h2.Start))
174+
return poll.Continue("waiting for health check interval to switch from the start interval")
175+
}, poll.WithDelay(time.Second), poll.WithTimeout(time.Until(dl)))
176+
}
177+
114178
func pollForHealthCheckLog(ctx context.Context, client client.APIClient, containerID string, expected string) func(log poll.LogT) poll.Result {
115179
return func(log poll.LogT) poll.Result {
116180
inspect, err := client.ContainerInspect(ctx, containerID)

0 commit comments

Comments
 (0)