Skip to content
This repository was archived by the owner on Mar 9, 2022. It is now read-only.

Commit df67dff

Browse files
authored
Merge pull request #885 from Random-Liu/enhance-container-stop
Fix an issue that container/sandbox can't be stopped.
2 parents a3af739 + bca304f commit df67dff

2 files changed

Lines changed: 54 additions & 37 deletions

File tree

pkg/server/container_stop.go

Lines changed: 43 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,10 @@ import (
3333

3434
// killContainerTimeout is the timeout that we wait for the container to
3535
// be SIGKILLed.
36-
const killContainerTimeout = 2 * time.Minute
36+
// The timeout is set to 1 min, because the default CRI operation timeout
37+
// for StopContainer is (2 min + stop timeout). Set to 1 min, so that we
38+
// have enough time for kill(all=true) and kill(all=false).
39+
const killContainerTimeout = 1 * time.Minute
3740

3841
// StopContainer stops a running container with a grace period (i.e., timeout).
3942
func (c *criService) StopContainer(ctx context.Context, r *runtime.StopContainerRequest) (*runtime.StopContainerResponse, error) {
@@ -63,6 +66,16 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore
6366
return nil
6467
}
6568

69+
task, err := container.Container.Task(ctx, nil)
70+
if err != nil {
71+
if !errdefs.IsNotFound(err) {
72+
return errors.Wrapf(err, "failed to stop container, task not found for container %q", id)
73+
}
74+
return nil
75+
}
76+
77+
// We only need to kill the task. The event handler will Delete the
78+
// task from containerd after it handles the Exited event.
6679
if timeout > 0 {
6780
stopSignal := unix.SIGTERM
6881
image, err := c.imageStore.Get(container.ImageRef)
@@ -81,52 +94,47 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore
8194
}
8295
}
8396
logrus.Infof("Stop container %q with signal %v", id, stopSignal)
84-
task, err := container.Container.Task(ctx, nil)
85-
if err != nil {
86-
if !errdefs.IsNotFound(err) {
87-
return errors.Wrapf(err, "failed to stop container, task not found for container %q", id)
88-
}
89-
return nil
90-
}
91-
if task != nil {
92-
if err = task.Kill(ctx, stopSignal); err != nil {
93-
if !errdefs.IsNotFound(err) {
94-
return errors.Wrapf(err, "failed to stop container %q", id)
95-
}
96-
// Move on to make sure container status is updated.
97-
}
97+
if err = task.Kill(ctx, stopSignal); err != nil && !errdefs.IsNotFound(err) {
98+
return errors.Wrapf(err, "failed to stop container %q", id)
9899
}
99100

100-
err = c.waitContainerStop(ctx, container, timeout)
101-
if err == nil {
101+
if err = c.waitContainerStop(ctx, container, timeout); err == nil {
102102
return nil
103103
}
104-
logrus.WithError(err).Errorf("Stop container %q timed out", id)
104+
logrus.WithError(err).Errorf("An error occurs during waiting for container %q to be stopped", id)
105105
}
106106

107-
task, err := container.Container.Task(ctx, nil)
108-
if err != nil {
109-
if !errdefs.IsNotFound(err) {
110-
return errors.Wrapf(err, "failed to stop container, task not found for container %q", id)
111-
}
107+
logrus.Infof("Kill container %q", id)
108+
if err = task.Kill(ctx, unix.SIGKILL, containerd.WithKillAll); err != nil && !errdefs.IsNotFound(err) {
109+
return errors.Wrapf(err, "failed to kill container %q", id)
110+
}
111+
112+
// Wait for a fixed timeout until container stop is observed by event monitor.
113+
if err = c.waitContainerStop(ctx, container, killContainerTimeout); err == nil {
112114
return nil
113115
}
114-
// Event handler will Delete the container from containerd after it handles the Exited event.
115-
logrus.Infof("Kill container %q", id)
116-
if task != nil {
117-
if err = task.Kill(ctx, unix.SIGKILL, containerd.WithKillAll); err != nil {
118-
if !errdefs.IsNotFound(err) {
119-
return errors.Wrapf(err, "failed to kill container %q", id)
120-
}
121-
// Move on to make sure container status is updated.
122-
}
116+
logrus.WithError(err).Errorf("An error occurs during waiting for container %q to be killed", id)
117+
118+
// This is a fix for `runc`, and should not break other runtimes. With
119+
// containerd.WithKillAll, `runc` will get all processes from the container
120+
// cgroups, and kill them. However, sometimes the processes may be moved
121+
// out from the container cgroup, e.g. users manually move them by mistake,
122+
// or systemd.Delegate=true is not set.
123+
// In these cases, we should try our best to do cleanup, kill the container
124+
// without containerd.WithKillAll, so that runc can kill the container init
125+
// process directly.
126+
// NOTE(random-liu): If pid namespace is shared inside the pod, non-init processes
127+
// of this container will be left running until the pause container is stopped.
128+
logrus.Infof("Kill container %q init process", id)
129+
if err = task.Kill(ctx, unix.SIGKILL); err != nil && !errdefs.IsNotFound(err) {
130+
return errors.Wrapf(err, "failed to kill container %q init process", id)
123131
}
124132

125133
// Wait for a fixed timeout until container stop is observed by event monitor.
126-
if err := c.waitContainerStop(ctx, container, killContainerTimeout); err != nil {
127-
return errors.Wrapf(err, "an error occurs during waiting for container %q to stop", id)
134+
if err = c.waitContainerStop(ctx, container, killContainerTimeout); err == nil {
135+
return nil
128136
}
129-
return nil
137+
return errors.Wrapf(err, "an error occurs during waiting for container %q init process to be killed", id)
130138
}
131139

132140
// waitContainerStop waits for container to be stopped until timeout exceeds or context is cancelled.

pkg/server/sandbox_stop.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,11 +109,20 @@ func (c *criService) stopSandboxContainer(ctx context.Context, sandbox sandboxst
109109
}
110110

111111
// Kill the sandbox container.
112-
err = task.Kill(ctx, unix.SIGKILL, containerd.WithKillAll)
113-
if err != nil && !errdefs.IsNotFound(err) {
112+
if err = task.Kill(ctx, unix.SIGKILL, containerd.WithKillAll); err != nil && !errdefs.IsNotFound(err) {
114113
return errors.Wrap(err, "failed to kill sandbox container")
115114
}
116115

116+
if err = c.waitSandboxStop(ctx, sandbox, killContainerTimeout); err == nil {
117+
return nil
118+
}
119+
logrus.WithError(err).Errorf("An error occurs during waiting for sandbox %q to be killed", sandbox.ID)
120+
121+
// Kill the sandbox container init process.
122+
if err = task.Kill(ctx, unix.SIGKILL); err != nil && !errdefs.IsNotFound(err) {
123+
return errors.Wrap(err, "failed to kill sandbox container init process")
124+
}
125+
117126
return c.waitSandboxStop(ctx, sandbox, killContainerTimeout)
118127
}
119128

0 commit comments

Comments
 (0)