Skip to content

Commit 3d27bc7

Browse files
adityaramani authored and dcantah committed
Handle unexpected shim kill events
When a shim process is unexpectedly killed in a way that was not initiated through containerd, containerd reports the pod as not ready but the containers as running. This results in kubelet repeatedly sending container kill requests that fail, since containerd cannot connect to the shim.

Changes:
- In the container exit handler, treat `err: Unavailable` as if the container has already exited.
- When attempting to get a connection to the shim, if the controller isn't available, assume that the shim has been killed. This is needed because a separate exit handler cleans up the reference to the shim controller before kubelet has the chance to call StopPodSandbox.

Signed-off-by: Aditya Ramani <[email protected]>
(cherry picked from commit 729c97c)
Signed-off-by: Danny Canter <[email protected]>
1 parent 4093cfc commit 3d27bc7

4 files changed

Lines changed: 17 additions & 7 deletions

File tree

pkg/cri/sbserver/events.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ func handleContainerExit(ctx context.Context, e *eventtypes.TaskExit, cntr conta
393393
},
394394
)
395395
if err != nil {
396-
if !errdefs.IsNotFound(err) {
396+
if !errdefs.IsNotFound(err) && !errdefs.IsUnavailable(err) {
397397
return fmt.Errorf("failed to load task for container: %w", err)
398398
}
399399
} else {

plugins/sandbox/controller.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,12 @@ func (c *controllerLocal) Wait(ctx context.Context, sandboxID string) (sandbox.E
260260

261261
func (c *controllerLocal) Status(ctx context.Context, sandboxID string, verbose bool) (sandbox.ControllerStatus, error) {
262262
svc, err := c.getSandbox(ctx, sandboxID)
263+
if errdefs.IsNotFound(err) {
264+
return sandbox.ControllerStatus{
265+
SandboxID: sandboxID,
266+
ExitedAt: time.Now(),
267+
}, nil
268+
}
263269
if err != nil {
264270
return sandbox.ControllerStatus{}, err
265271
}
@@ -286,7 +292,7 @@ func (c *controllerLocal) Status(ctx context.Context, sandboxID string, verbose
286292
func (c *controllerLocal) getSandbox(ctx context.Context, id string) (runtimeAPI.TTRPCSandboxService, error) {
287293
shim, err := c.shims.Get(ctx, id)
288294
if err != nil {
289-
return nil, errdefs.ErrNotFound
295+
return nil, err
290296
}
291297

292298
return sandbox.NewClient(shim.Client())

services/sandbox/controller_service.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -144,17 +144,21 @@ func (s *controllerService) Status(ctx context.Context, req *api.ControllerStatu
144144
if err != nil {
145145
return &api.ControllerStatusResponse{}, errdefs.ToGRPC(err)
146146
}
147+
extra := &anypb.Any{}
148+
if cstatus.Extra != nil {
149+
extra = &anypb.Any{
150+
TypeUrl: cstatus.Extra.GetTypeUrl(),
151+
Value: cstatus.Extra.GetValue(),
152+
}
153+
}
147154
return &api.ControllerStatusResponse{
148155
SandboxID: cstatus.SandboxID,
149156
Pid: cstatus.Pid,
150157
State: cstatus.State,
151158
Info: cstatus.Info,
152159
CreatedAt: protobuf.ToTimestamp(cstatus.CreatedAt),
153160
ExitedAt: protobuf.ToTimestamp(cstatus.ExitedAt),
154-
Extra: &anypb.Any{
155-
TypeUrl: cstatus.Extra.GetTypeUrl(),
156-
Value: cstatus.Extra.GetValue(),
157-
},
161+
Extra: extra,
158162
}, nil
159163
}
160164

services/tasks/local.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,7 @@ func getProcessState(ctx context.Context, p runtime.Process) (*task.Process, err
339339

340340
state, err := p.State(ctx)
341341
if err != nil {
342-
if errdefs.IsNotFound(err) {
342+
if errdefs.IsNotFound(err) || errdefs.IsUnavailable(err) {
343343
return nil, err
344344
}
345345
log.G(ctx).WithError(err).Errorf("get state for %s", p.ID())

0 commit comments

Comments
 (0)