@@ -33,7 +33,10 @@ import (
3333
3434// killContainerTimeout is the timeout that we wait for the container to
3535// be SIGKILLed.
36- const killContainerTimeout = 2 * time .Minute
36+ // The timeout is set to 1 min, because the default CRI operation timeout
37+ // for StopContainer is (2 min + stop timeout). Set to 1 min, so that we
38+ // have enough time for kill(all=true) and kill(all=false).
39+ const killContainerTimeout = 1 * time .Minute
3740
3841// StopContainer stops a running container with a grace period (i.e., timeout).
3942func (c * criService ) StopContainer (ctx context.Context , r * runtime.StopContainerRequest ) (* runtime.StopContainerResponse , error ) {
@@ -63,6 +66,16 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore
6366 return nil
6467 }
6568
69+ task , err := container .Container .Task (ctx , nil )
70+ if err != nil {
71+ if ! errdefs .IsNotFound (err ) {
72+ return errors .Wrapf (err , "failed to stop container, task not found for container %q" , id )
73+ }
74+ return nil
75+ }
76+
77+ // We only need to kill the task. The event handler will Delete the
78+ // task from containerd after it handles the Exited event.
6679 if timeout > 0 {
6780 stopSignal := unix .SIGTERM
6881 image , err := c .imageStore .Get (container .ImageRef )
@@ -81,52 +94,47 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore
8194 }
8295 }
8396 logrus .Infof ("Stop container %q with signal %v" , id , stopSignal )
84- task , err := container .Container .Task (ctx , nil )
85- if err != nil {
86- if ! errdefs .IsNotFound (err ) {
87- return errors .Wrapf (err , "failed to stop container, task not found for container %q" , id )
88- }
89- return nil
90- }
91- if task != nil {
92- if err = task .Kill (ctx , stopSignal ); err != nil {
93- if ! errdefs .IsNotFound (err ) {
94- return errors .Wrapf (err , "failed to stop container %q" , id )
95- }
96- // Move on to make sure container status is updated.
97- }
97+ if err = task .Kill (ctx , stopSignal ); err != nil && ! errdefs .IsNotFound (err ) {
98+ return errors .Wrapf (err , "failed to stop container %q" , id )
9899 }
99100
100- err = c .waitContainerStop (ctx , container , timeout )
101- if err == nil {
101+ if err = c .waitContainerStop (ctx , container , timeout ); err == nil {
102102 return nil
103103 }
104- logrus .WithError (err ).Errorf ("Stop container %q timed out " , id )
104+ logrus .WithError (err ).Errorf ("An error occurs during waiting for container %q to be stopped " , id )
105105 }
106106
107- task , err := container .Container .Task (ctx , nil )
108- if err != nil {
109- if ! errdefs .IsNotFound (err ) {
110- return errors .Wrapf (err , "failed to stop container, task not found for container %q" , id )
111- }
107+ logrus .Infof ("Kill container %q" , id )
108+ if err = task .Kill (ctx , unix .SIGKILL , containerd .WithKillAll ); err != nil && ! errdefs .IsNotFound (err ) {
109+ return errors .Wrapf (err , "failed to kill container %q" , id )
110+ }
111+
112+ // Wait for a fixed timeout until container stop is observed by event monitor.
113+ if err = c .waitContainerStop (ctx , container , killContainerTimeout ); err == nil {
112114 return nil
113115 }
114- // Event handler will Delete the container from containerd after it handles the Exited event.
115- logrus .Infof ("Kill container %q" , id )
116- if task != nil {
117- if err = task .Kill (ctx , unix .SIGKILL , containerd .WithKillAll ); err != nil {
118- if ! errdefs .IsNotFound (err ) {
119- return errors .Wrapf (err , "failed to kill container %q" , id )
120- }
121- // Move on to make sure container status is updated.
122- }
116+ logrus .WithError (err ).Errorf ("An error occurs during waiting for container %q to be killed" , id )
117+
118+ // This is a fix for `runc`, and should not break other runtimes. With
119+ // containerd.WithKillAll, `runc` will get all processes from the container
120+ // cgroups, and kill them. However, sometimes the processes may be moved
121+ // out from the container cgroup, e.g. users manually move them by mistake,
122+ // or systemd.Delegate=true is not set.
123+ // In these cases, we should try our best to do cleanup, kill the container
124+ // without containerd.WithKillAll, so that runc can kill the container init
125+ // process directly.
126+ // NOTE(random-liu): If pid namespace is shared inside the pod, non-init processes
127+ // of this container will be left running until the pause container is stopped.
128+ logrus .Infof ("Kill container %q init process" , id )
129+ if err = task .Kill (ctx , unix .SIGKILL ); err != nil && ! errdefs .IsNotFound (err ) {
130+ return errors .Wrapf (err , "failed to kill container %q init process" , id )
123131 }
124132
125133 // Wait for a fixed timeout until container stop is observed by event monitor.
126- if err : = c .waitContainerStop (ctx , container , killContainerTimeout ); err ! = nil {
127- return errors . Wrapf ( err , "an error occurs during waiting for container %q to stop" , id )
134+ if err = c .waitContainerStop (ctx , container , killContainerTimeout ); err = = nil {
135+ return nil
128136 }
129- return nil
137+ return errors . Wrapf ( err , "an error occurs during waiting for container %q init process to be killed" , id )
130138}
131139
132140// waitContainerStop waits for container to be stopped until timeout exceeds or context is cancelled.
0 commit comments