Skip to content
This repository was archived by the owner on Mar 9, 2022. It is now read-only.

Commit bf62320

Browse files
committed
Add timeout for container/sandbox recover and event monitor.
Signed-off-by: Lantao Liu <[email protected]>
1 parent 79645ed commit bf62320

File tree

3 files changed

+32
-6
lines changed

3 files changed

+32
-6
lines changed

pkg/server/events.go

+11-2
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,17 @@ const (
4242
backOffInitDuration = 1 * time.Second
4343
backOffMaxDuration = 5 * time.Minute
4444
backOffExpireCheckDuration = 1 * time.Second
45+
46+
// handleEventTimeout is the timeout for handling 1 event. Event monitor
47+
// handles events serially; if one event blocks the event monitor, no
48+
// other events can be handled.
49+
// Add a timeout to each event's handling; events that time out will be requeued and
50+
// handled again in the future.
51+
handleEventTimeout = 10 * time.Second
4552
)
4653

4754
// eventMonitor monitors containerd event and updates internal state correspondingly.
48-
// TODO(random-liu): [P1] Figure out is it possible to drop event during containerd
49-
// is running. If it is, we should do periodically list to sync state with containerd.
55+
// TODO(random-liu): Handle event for each container in a separate goroutine.
5056
type eventMonitor struct {
5157
containerStore *containerstore.Store
5258
sandboxStore *sandboxstore.Store
@@ -189,6 +195,9 @@ func (em *eventMonitor) stop() {
189195
// handleEvent handles a containerd event.
190196
func (em *eventMonitor) handleEvent(any interface{}) error {
191197
ctx := ctrdutil.NamespacedContext()
198+
ctx, cancel := context.WithTimeout(ctx, handleEventTimeout)
199+
defer cancel()
200+
192201
switch any.(type) {
193202
// If containerd-shim exits unexpectedly, there will be no corresponding event.
194203
// However, containerd could not retrieve container state in that case, so it's

pkg/server/restart.go

+20-3
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,23 @@ func (c *criService) recover(ctx context.Context) error {
136136
return nil
137137
}
138138

139+
// loadContainerTimeout is the default timeout for loading a container/sandbox.
140+
// One hanging container/sandbox (e.g. containerd#2438) should not affect other
141+
// containers/sandboxes.
142+
// Most CRI container/sandbox related operations are per container, the ones
143+
// which handle multiple containers at a time are:
144+
// * ListPodSandboxes: Doesn't talk to containerd services.
145+
// * ListContainers: Doesn't talk to containerd services.
146+
// * ListContainerStats: Not in critical code path, a default timeout will
147+
// be applied at CRI level.
148+
// * Recovery logic: We should set a timeout for each container/sandbox recovery.
149+
// * Event monitor: We should set a timeout for each container/sandbox event handling.
150+
const loadContainerTimeout = 10 * time.Second
151+
139152
// loadContainer loads container from containerd and status checkpoint.
140153
func (c *criService) loadContainer(ctx context.Context, cntr containerd.Container) (containerstore.Container, error) {
154+
ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout)
155+
defer cancel()
141156
id := cntr.ID()
142157
containerDir := c.getContainerRootDir(id)
143158
volatileContainerDir := c.getVolatileContainerRootDir(id)
@@ -290,16 +305,18 @@ const (
290305
// unknownContainerStatus returns the default container status when its status is unknown.
291306
func unknownContainerStatus() containerstore.Status {
292307
return containerstore.Status{
293-
CreatedAt: time.Now().UnixNano(),
294-
StartedAt: time.Now().UnixNano(),
295-
FinishedAt: time.Now().UnixNano(),
308+
CreatedAt: 0,
309+
StartedAt: 0,
310+
FinishedAt: 0,
296311
ExitCode: unknownExitCode,
297312
Reason: unknownExitReason,
298313
}
299314
}
300315

301316
// loadSandbox loads sandbox from containerd.
302317
func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error) {
318+
ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout)
319+
defer cancel()
303320
var sandbox sandboxstore.Sandbox
304321
// Load sandbox metadata.
305322
exts, err := cntr.Extensions(ctx)

pkg/store/image/fake_image.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ func NewFakeStore(images []Image) (*Store, error) {
2727
s.refCache[ref] = i.ID
2828
}
2929
if err := s.store.add(i); err != nil {
30-
return nil, errors.Wrapf(err, "add image %q", i)
30+
return nil, errors.Wrapf(err, "add image %+v", i)
3131
}
3232
}
3333
return s, nil

0 commit comments

Comments
 (0)