Skip to content

Commit 4b05d03

Browse files
committed
runtime/v2: cleanup dead shim before delete bundle
The shim delete action needs bundle information to cleanup resources created by shim. If the cleanup dead shim is called after delete bundle, the part of resources maybe leaky. The ttrpc client UserOnCloseWait() can make sure that resources are cleanup before delete bundle, which synchronizes task deletion and cleanup deadshim. It might slow down the task deletion, but it can make sure that resources can be cleanup and avoid EBUSY umount case. For example, the sandbox container like Kata/Firecracker might have mount points over the rootfs. If containerd handles task deletion and cleanup deadshim parallelly, the task deletion will meet EBUSY during umount and fail to cleanup bundle, which makes case worse. And also update cleanupAfterDeadshim, which makes sure that cleanupAfterDeadshim must be called after shim disconnected. In some case, shim fails to call runc-create for some reason, but the runc-create already makes runc-init into ready state. If containerd doesn't call shim deletion, the runc-init process will be leaky and hold the cgroup, which makes pod terminating :(. Signed-off-by: Wei Fu <[email protected]>
1 parent fabebe5 commit 4b05d03

4 files changed

Lines changed: 44 additions & 29 deletions

File tree

runtime/v2/manager.go

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -138,12 +138,8 @@ func (m *TaskManager) Create(ctx context.Context, id string, opts runtime.Create
138138
b := shimBinary(ctx, bundle, opts.Runtime, m.containerdAddress, m.containerdTTRPCAddress, m.events, m.tasks)
139139
shim, err := b.Start(ctx, topts, func() {
140140
log.G(ctx).WithField("id", id).Info("shim disconnected")
141-
_, err := m.tasks.Get(ctx, id)
142-
if err != nil {
143-
// Task was never started or was already successfully deleted
144-
return
145-
}
146-
cleanupAfterDeadShim(context.Background(), id, ns, m.events, b)
141+
142+
cleanupAfterDeadShim(context.Background(), id, ns, m.tasks, m.events, b)
147143
// Remove self from the runtime task list. Even though the cleanupAfterDeadShim()
148144
// would publish taskExit event, but the shim.Delete() would always failed with ttrpc
149145
// disconnect and there is no chance to remove this dead task from runtime task lists.
@@ -266,17 +262,13 @@ func (m *TaskManager) loadTasks(ctx context.Context) error {
266262
binaryCall := shimBinary(ctx, bundle, container.Runtime.Name, m.containerdAddress, m.containerdTTRPCAddress, m.events, m.tasks)
267263
shim, err := loadShim(ctx, bundle, m.events, m.tasks, func() {
268264
log.G(ctx).WithField("id", id).Info("shim disconnected")
269-
_, err := m.tasks.Get(ctx, id)
270-
if err != nil {
271-
// Task was never started or was already successfully deleted
272-
return
273-
}
274-
cleanupAfterDeadShim(context.Background(), id, ns, m.events, binaryCall)
265+
266+
cleanupAfterDeadShim(context.Background(), id, ns, m.tasks, m.events, binaryCall)
275267
// Remove self from the runtime task list.
276268
m.tasks.Delete(ctx, id)
277269
})
278270
if err != nil {
279-
cleanupAfterDeadShim(ctx, id, ns, m.events, binaryCall)
271+
cleanupAfterDeadShim(ctx, id, ns, m.tasks, m.events, binaryCall)
280272
continue
281273
}
282274
m.tasks.Add(ctx, shim)

runtime/v2/shim.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ func loadShim(ctx context.Context, bundle *Bundle, events *exchange.Exchange, rt
121121
return s, nil
122122
}
123123

124-
func cleanupAfterDeadShim(ctx context.Context, id, ns string, events *exchange.Exchange, binaryCall *binary) {
124+
func cleanupAfterDeadShim(ctx context.Context, id, ns string, rt *runtime.TaskList, events *exchange.Exchange, binaryCall *binary) {
125125
ctx = namespaces.WithNamespace(ctx, ns)
126126
ctx, cancel := timeout.WithContext(ctx, cleanupTimeout)
127127
defer cancel()
@@ -138,6 +138,12 @@ func cleanupAfterDeadShim(ctx context.Context, id, ns string, events *exchange.E
138138
}).Warn("failed to clean up after shim disconnected")
139139
}
140140

141+
if _, err := rt.Get(ctx, id); err != nil {
142+
// Task was never started or was already successfully deleted
143+
// No need to publish events
144+
return
145+
}
146+
141147
var (
142148
pid uint32
143149
exitStatus uint32
@@ -234,13 +240,15 @@ func (s *shim) Delete(ctx context.Context) (*runtime.Exit, error) {
234240
}
235241
}
236242
}
237-
// remove self from the runtime task list
238-
// this seems dirty but it cleans up the API across runtimes, tasks, and the service
239-
s.rtTasks.Delete(ctx, s.ID())
240243
if err := s.waitShutdown(ctx); err != nil {
241244
log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to shutdown shim")
242245
}
243246
s.Close()
247+
s.client.UserOnCloseWait(ctx)
248+
249+
// remove self from the runtime task list
250+
// this seems dirty but it cleans up the API across runtimes, tasks, and the service
251+
s.rtTasks.Delete(ctx, s.ID())
244252
if err := s.bundle.Delete(); err != nil {
245253
log.G(ctx).WithField("id", s.ID()).WithError(err).Error("failed to delete bundle")
246254
}

vendor.conf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ github.com/containerd/continuity efbc4488d8fe1bdc16bde3b2d299
88
github.com/containerd/fifo f15a3290365b9d2627d189e619ab4008e0069caf
99
github.com/containerd/go-runc 7016d3ce2328dd2cb1192b2076ebd565c4e8df0c
1010
github.com/containerd/nri 0afc7f031eaf9c7d9c1a381b7ab5462e89c998fc
11-
github.com/containerd/ttrpc v1.0.1
11+
github.com/containerd/ttrpc v1.0.2
1212
github.com/containerd/typeurl v1.0.1
1313
github.com/coreos/go-systemd/v22 v22.1.0
1414
github.com/cpuguy83/go-md2man/v2 v2.0.0

vendor/github.com/containerd/ttrpc/client.go

Lines changed: 26 additions & 11 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)