Skip to content

Commit 729c97c

Browse files
committed
Handle unexpected shim kill events
When a shim process is unexpectedly killed in a way that was not initiated through containerd, containerd reports the pod as not ready but the containers as still running. As a result, kubelet repeatedly sends container kill requests, which fail because containerd cannot connect to the shim.

Changes:
- In the container exit handler, treat `err: Unavailable` as if the container has already exited.
- When attempting to get a connection to the shim, if the controller isn't available, assume that the shim has been killed. (This is needed because a separate exit handler cleans up the reference to the shim controller before kubelet has the chance to call StopPodSandbox.)

Signed-off-by: Aditya Ramani <[email protected]>
1 parent 82df7d5 commit 729c97c

4 files changed

Lines changed: 17 additions & 7 deletions

File tree

pkg/cri/sbserver/events.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ func handleContainerExit(ctx context.Context, e *eventtypes.TaskExit, cntr conta
393393
},
394394
)
395395
if err != nil {
396-
if !errdefs.IsNotFound(err) {
396+
if !errdefs.IsNotFound(err) && !errdefs.IsUnavailable(err) {
397397
return fmt.Errorf("failed to load task for container: %w", err)
398398
}
399399
} else {

plugins/sandbox/controller.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,12 @@ func (c *controllerLocal) Wait(ctx context.Context, sandboxID string) (sandbox.E
262262

263263
func (c *controllerLocal) Status(ctx context.Context, sandboxID string, verbose bool) (sandbox.ControllerStatus, error) {
264264
svc, err := c.getSandbox(ctx, sandboxID)
265+
if errdefs.IsNotFound(err) {
266+
return sandbox.ControllerStatus{
267+
SandboxID: sandboxID,
268+
ExitedAt: time.Now(),
269+
}, nil
270+
}
265271
if err != nil {
266272
return sandbox.ControllerStatus{}, err
267273
}
@@ -301,7 +307,7 @@ func (c *controllerLocal) Metrics(ctx context.Context, sandboxID string) (*types
301307
func (c *controllerLocal) getSandbox(ctx context.Context, id string) (runtimeAPI.TTRPCSandboxService, error) {
302308
shim, err := c.shims.Get(ctx, id)
303309
if err != nil {
304-
return nil, errdefs.ErrNotFound
310+
return nil, err
305311
}
306312

307313
return sandbox.NewClient(shim.Client())

services/sandbox/controller_service.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -144,17 +144,21 @@ func (s *controllerService) Status(ctx context.Context, req *api.ControllerStatu
144144
if err != nil {
145145
return &api.ControllerStatusResponse{}, errdefs.ToGRPC(err)
146146
}
147+
extra := &anypb.Any{}
148+
if cstatus.Extra != nil {
149+
extra = &anypb.Any{
150+
TypeUrl: cstatus.Extra.GetTypeUrl(),
151+
Value: cstatus.Extra.GetValue(),
152+
}
153+
}
147154
return &api.ControllerStatusResponse{
148155
SandboxID: cstatus.SandboxID,
149156
Pid: cstatus.Pid,
150157
State: cstatus.State,
151158
Info: cstatus.Info,
152159
CreatedAt: protobuf.ToTimestamp(cstatus.CreatedAt),
153160
ExitedAt: protobuf.ToTimestamp(cstatus.ExitedAt),
154-
Extra: &anypb.Any{
155-
TypeUrl: cstatus.Extra.GetTypeUrl(),
156-
Value: cstatus.Extra.GetValue(),
157-
},
161+
Extra: extra,
158162
}, nil
159163
}
160164

services/tasks/local.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ func getProcessState(ctx context.Context, p runtime.Process) (*task.Process, err
311311

312312
state, err := p.State(ctx)
313313
if err != nil {
314-
if errdefs.IsNotFound(err) {
314+
if errdefs.IsNotFound(err) || errdefs.IsUnavailable(err) {
315315
return nil, err
316316
}
317317
log.G(ctx).WithError(err).Errorf("get state for %s", p.ID())

0 commit comments

Comments
 (0)