Skip to content

Commit f8e83e6

Browse files
authored
Terminate running containers when CRI restarts (containerd#107)
* Terminate running containers when CRI restarts

  Currently, if any containers are still running when CRI restarts, we are unable to reconnect to their IO pipes. Due to this, the containers can get stuck if they do blocking IO operations, as the pipes are not being drained by CRI anymore.

  There is longer-term work to reconnect to the IO pipes, but in the meantime, this change makes CRI terminate any running containers in two cases:
  - When CRI gracefully shuts down
  - When CRI starts up, as it rediscovers running containers

  This change also improves logging a little in the restart logic.

  Signed-off-by: Kevin Parsons <[email protected]>

* Use correct context type

  Signed-off-by: Kevin Parsons <[email protected]>
1 parent c738ca4 commit f8e83e6

3 files changed

Lines changed: 31 additions & 5 deletions

File tree

pkg/config/config.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ type PluginConfig struct {
163163
RestrictOOMScoreAdj bool `toml:"restrict_oom_score_adj" json:"restrictOOMScoreAdj"`
164164
// Sets GODEBUG=http2client=0 if enabled.
165165
DisableHTTP2Client bool `toml:"disable_http2_client" json:"disableHTTP2Client"`
166+
// Determines whether any running containers should be terminated when CRI shuts down or starts up.
167+
TerminateContainersOnRestart bool `toml:"terminate_containers_on_restart" json:"terminateContainersOnRestart"`
166168
}
167169

168170
// X509KeyPairStreaming contains the x509 configuration for streaming

pkg/server/restart.go

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,14 @@ func (c *criService) recover(ctx context.Context) error {
5858
return errors.Wrap(err, "failed to list sandbox containers")
5959
}
6060
for _, sandbox := range sandboxes {
61+
l := log.G(ctx).WithField("containerID", sandbox.ID())
62+
l.Debug("Loading sandbox")
6163
sb, err := c.loadSandbox(ctx, sandbox)
6264
if err != nil {
63-
log.G(ctx).WithError(err).Errorf("Failed to load sandbox %q", sandbox.ID())
65+
l.WithError(err).Errorf("Failed to load sandbox")
6466
continue
6567
}
66-
log.G(ctx).Debugf("Loaded sandbox %+v", sb)
68+
l.Debugf("Loaded sandbox")
6769
if err := c.sandboxStore.Add(sb); err != nil {
6870
return errors.Wrapf(err, "failed to add sandbox %q to store", sandbox.ID())
6971
}
@@ -78,12 +80,14 @@ func (c *criService) recover(ctx context.Context) error {
7880
return errors.Wrap(err, "failed to list containers")
7981
}
8082
for _, container := range containers {
83+
l := log.G(ctx).WithField("containerID", container.ID())
84+
l.Debug("Loading container")
8185
cntr, err := c.loadContainer(ctx, container)
8286
if err != nil {
83-
log.G(ctx).WithError(err).Errorf("Failed to load container %q", container.ID())
87+
l.WithError(err).Errorf("Failed to load container")
8488
continue
8589
}
86-
log.G(ctx).Debugf("Loaded container %+v", cntr)
90+
l.Debugf("Loaded container")
8791
if err := c.containerStore.Add(cntr); err != nil {
8892
return errors.Wrapf(err, "failed to add container %q to store", container.ID())
8993
}
@@ -251,6 +255,11 @@ func (c *criService) loadContainer(ctx context.Context, cntr containerd.Containe
251255
// Container is in exited/unknown state, return the status as it is.
252256
}
253257
} else {
258+
// If we need to terminate any running containers, mark it as stopped here.
259+
// This will cause it to be stopped via WithProcessKill in the switch case below.
260+
if c.config.TerminateContainersOnRestart {
261+
s.Status = containerd.Stopped
262+
}
254263
// Task status is found. Update container status based on the up-to-date task status.
255264
switch s.Status {
256265
case containerd.Created:
@@ -293,6 +302,7 @@ func (c *criService) loadContainer(ctx context.Context, cntr containerd.Containe
293302
c.eventMonitor.startExitMonitor(context.Background(), id, status.Pid, exitCh)
294303
}
295304
case containerd.Stopped:
305+
log.G(ctx).WithField("containerID", cntr.ID()).Info("Deleting dead container task")
296306
// Task is stopped. Update status and delete the task.
297307
if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
298308
return errors.Wrap(err, "failed to delete task")
@@ -374,7 +384,9 @@ func (c *criService) loadSandbox(ctx context.Context, cntr containerd.Container)
374384
// Task does not exist, set sandbox state as NOTREADY.
375385
status.State = sandboxstore.StateNotReady
376386
} else {
377-
if taskStatus.Status == containerd.Running {
387+
// If we need to terminate running containers, treat it as stopped, so the else branch
388+
// will clean it up via WithProcessKill.
389+
if taskStatus.Status == containerd.Running && !c.config.TerminateContainersOnRestart {
378390
// Wait for the task for sandbox monitor.
379391
// wait is a long running background request, no timeout needed.
380392
exitCh, err := t.Wait(ctrdutil.NamespacedContext())
@@ -390,6 +402,7 @@ func (c *criService) loadSandbox(ctx context.Context, cntr containerd.Container)
390402
c.eventMonitor.startExitMonitor(context.Background(), meta.ID, status.Pid, exitCh)
391403
}
392404
} else {
405+
log.G(ctx).WithField("sandboxID", cntr.ID()).Info("Deleting dead sandbox task")
393406
// Task is not running. Delete the task and set sandbox state as NOTREADY.
394407
if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
395408
return status, errors.Wrap(err, "failed to delete task")

pkg/server/service.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package server
1818

1919
import (
20+
"context"
2021
"fmt"
2122
"io"
2223
"net/http"
@@ -26,6 +27,7 @@ import (
2627
"time"
2728

2829
"github.com/containerd/containerd"
30+
"github.com/containerd/containerd/log"
2931
"github.com/containerd/containerd/plugin"
3032
cni "github.com/containerd/go-cni"
3133
"github.com/pkg/errors"
@@ -272,6 +274,15 @@ func (c *criService) Run() error {
272274
// TODO(random-liu): Make close synchronous.
273275
func (c *criService) Close() error {
274276
logrus.Info("Stop CRI service")
277+
if c.config.TerminateContainersOnRestart {
278+
// We need to stop any running containers. Do this by stopping all pods.
279+
ctx := context.Background()
280+
for _, sandbox := range c.sandboxStore.List() {
281+
if err := c.stopPodSandbox(ctx, sandbox); err != nil {
282+
log.G(ctx).WithField("sandboxID", sandbox.Metadata.ID).Error("Failed to stop sandbox on shutdown")
283+
}
284+
}
285+
}
275286
c.eventMonitor.stop()
276287
if err := c.streamServer.Stop(); err != nil {
277288
return errors.Wrap(err, "failed to stop stream server")

0 commit comments

Comments
 (0)