Skip to content
This repository was archived by the owner on Mar 9, 2022. It is now read-only.

Commit cdbb238

Browse files
authored
Merge pull request #1136 from Random-Liu/cherrypick-#1133-release-1.2
Cherrypick #1133 release 1.2
2 parents cb0eb47 + 27b70db commit cdbb238

18 files changed

Lines changed: 387 additions & 264 deletions

pkg/server/container_remove.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,9 @@ func setContainerRemoving(container containerstore.Container) error {
105105
if status.State() == runtime.ContainerState_CONTAINER_UNKNOWN {
106106
return status, errors.New("container state is unknown, to stop first")
107107
}
108+
if status.Starting {
109+
return status, errors.New("container is in starting state, can't be removed")
110+
}
108111
if status.Removing {
109112
return status, errors.New("container is already in removing state")
110113
}

pkg/server/container_remove_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,13 @@ func TestSetContainerRemoving(t *testing.T) {
4040
},
4141
expectErr: true,
4242
},
43+
"should return error when container is in starting state": {
44+
status: containerstore.Status{
45+
CreatedAt: time.Now().UnixNano(),
46+
Starting: true,
47+
},
48+
expectErr: true,
49+
},
4350
"should return error when container is in removing state": {
4451
status: containerstore.Status{
4552
CreatedAt: time.Now().UnixNano(),
@@ -71,6 +78,8 @@ func TestSetContainerRemoving(t *testing.T) {
7178
} else {
7279
assert.NoError(t, err)
7380
assert.True(t, container.Status.Get().Removing, "removing should be set")
81+
assert.NoError(t, resetContainerRemoving(container))
82+
assert.False(t, container.Status.Get().Removing, "removing should be reset")
7483
}
7584
}
7685
}

pkg/server/container_start.go

Lines changed: 72 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -38,64 +38,48 @@ import (
3838

3939
// StartContainer starts the container.
4040
func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContainerRequest) (retRes *runtime.StartContainerResponse, retErr error) {
41-
container, err := c.containerStore.Get(r.GetContainerId())
41+
cntr, err := c.containerStore.Get(r.GetContainerId())
4242
if err != nil {
4343
return nil, errors.Wrapf(err, "an error occurred when try to find container %q", r.GetContainerId())
4444
}
4545

46-
var startErr error
47-
// update container status in one transaction to avoid race with event monitor.
48-
if err := container.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
49-
// Always apply status change no matter startContainer fails or not. Because startContainer
50-
// may change container state no matter it fails or succeeds.
51-
startErr = c.startContainer(ctx, container, &status)
52-
return status, nil
53-
}); startErr != nil {
54-
return nil, startErr
55-
} else if err != nil {
56-
return nil, errors.Wrapf(err, "failed to update container %q metadata", container.ID)
57-
}
58-
return &runtime.StartContainerResponse{}, nil
59-
}
60-
61-
// startContainer actually starts the container. The function needs to be run in one transaction. Any updates
62-
// to the status passed in will be applied no matter the function returns error or not.
63-
func (c *criService) startContainer(ctx context.Context,
64-
cntr containerstore.Container,
65-
status *containerstore.Status) (retErr error) {
6646
id := cntr.ID
6747
meta := cntr.Metadata
6848
container := cntr.Container
6949
config := meta.Config
7050

71-
// Return error if container is not in created state.
72-
if status.State() != runtime.ContainerState_CONTAINER_CREATED {
73-
return errors.Errorf("container %q is in %s state", id, criContainerStateToString(status.State()))
74-
}
75-
// Do not start the container when there is a removal in progress.
76-
if status.Removing {
77-
return errors.Errorf("container %q is in removing state", id)
51+
// Set starting state to prevent other start/remove operations against this container
52+
// while it's being started.
53+
if err := setContainerStarting(cntr); err != nil {
54+
return nil, errors.Wrapf(err, "failed to set starting state for container %q", id)
7855
}
79-
8056
defer func() {
8157
if retErr != nil {
8258
// Set container to exited if fail to start.
83-
status.Pid = 0
84-
status.FinishedAt = time.Now().UnixNano()
85-
status.ExitCode = errorStartExitCode
86-
status.Reason = errorStartReason
87-
status.Message = retErr.Error()
59+
if err := cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
60+
status.Pid = 0
61+
status.FinishedAt = time.Now().UnixNano()
62+
status.ExitCode = errorStartExitCode
63+
status.Reason = errorStartReason
64+
status.Message = retErr.Error()
65+
return status, nil
66+
}); err != nil {
67+
logrus.WithError(err).Errorf("failed to set start failure state for container %q", id)
68+
}
69+
}
70+
if err := resetContainerStarting(cntr); err != nil {
71+
logrus.WithError(err).Errorf("failed to reset starting state for container %q", id)
8872
}
8973
}()
9074

9175
// Get sandbox config from sandbox store.
9276
sandbox, err := c.sandboxStore.Get(meta.SandboxID)
9377
if err != nil {
94-
return errors.Wrapf(err, "sandbox %q not found", meta.SandboxID)
78+
return nil, errors.Wrapf(err, "sandbox %q not found", meta.SandboxID)
9579
}
9680
sandboxID := meta.SandboxID
9781
if sandbox.Status.Get().State != sandboxstore.StateReady {
98-
return errors.Errorf("sandbox container %q is not running", sandboxID)
82+
return nil, errors.Errorf("sandbox container %q is not running", sandboxID)
9983
}
10084

10185
ioCreation := func(id string) (_ containerdio.IO, err error) {
@@ -110,7 +94,7 @@ func (c *criService) startContainer(ctx context.Context,
11094

11195
ctrInfo, err := container.Info(ctx)
11296
if err != nil {
113-
return errors.Wrap(err, "failed to get container info")
97+
return nil, errors.Wrap(err, "failed to get container info")
11498
}
11599

116100
var taskOpts []containerd.NewTaskOpts
@@ -120,7 +104,7 @@ func (c *criService) startContainer(ctx context.Context,
120104
}
121105
task, err := container.NewTask(ctx, ioCreation, taskOpts...)
122106
if err != nil {
123-
return errors.Wrap(err, "failed to create containerd task")
107+
return nil, errors.Wrap(err, "failed to create containerd task")
124108
}
125109
defer func() {
126110
if retErr != nil {
@@ -133,15 +117,61 @@ func (c *criService) startContainer(ctx context.Context,
133117
}
134118
}()
135119

120+
// wait is a long running background request, no timeout needed.
121+
exitCh, err := task.Wait(ctrdutil.NamespacedContext())
122+
if err != nil {
123+
return nil, errors.Wrap(err, "failed to wait for containerd task")
124+
}
125+
136126
// Start containerd task.
137127
if err := task.Start(ctx); err != nil {
138-
return errors.Wrapf(err, "failed to start containerd task %q", id)
128+
return nil, errors.Wrapf(err, "failed to start containerd task %q", id)
139129
}
140130

141131
// Update container start timestamp.
142-
status.Pid = task.Pid()
143-
status.StartedAt = time.Now().UnixNano()
144-
return nil
132+
if err := cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
133+
status.Pid = task.Pid()
134+
status.StartedAt = time.Now().UnixNano()
135+
return status, nil
136+
}); err != nil {
137+
return nil, errors.Wrapf(err, "failed to update container %q state", id)
138+
}
139+
140+
// start the monitor after updating container state, this ensures that
141+
// event monitor receives the TaskExit event and update container state
142+
// after this.
143+
c.eventMonitor.startExitMonitor(context.Background(), id, task.Pid(), exitCh)
144+
145+
return &runtime.StartContainerResponse{}, nil
146+
}
147+
148+
// setContainerStarting sets the container into starting state. In starting state, the
149+
// container will not be removed or started again.
150+
func setContainerStarting(container containerstore.Container) error {
151+
return container.Status.Update(func(status containerstore.Status) (containerstore.Status, error) {
152+
// Return error if container is not in created state.
153+
if status.State() != runtime.ContainerState_CONTAINER_CREATED {
154+
return status, errors.Errorf("container is in %s state", criContainerStateToString(status.State()))
155+
}
156+
// Do not start the container when there is a removal in progress.
157+
if status.Removing {
158+
return status, errors.New("container is in removing state, can't be started")
159+
}
160+
if status.Starting {
161+
return status, errors.New("container is already in starting state")
162+
}
163+
status.Starting = true
164+
return status, nil
165+
})
166+
}
167+
168+
// resetContainerStarting resets the container starting state on start failure. So
169+
// that we could remove the container later.
170+
func resetContainerStarting(container containerstore.Container) error {
171+
return container.Status.Update(func(status containerstore.Status) (containerstore.Status, error) {
172+
status.Starting = false
173+
return status, nil
174+
})
145175
}
146176

147177
// createContainerLoggers creates container loggers and return write closer for stdout and stderr.

pkg/server/container_start_test.go

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
/*
2+
Copyright 2019 The containerd Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package server
18+
19+
import (
20+
"testing"
21+
"time"
22+
23+
"github.com/stretchr/testify/assert"
24+
25+
containerstore "github.com/containerd/cri/pkg/store/container"
26+
)
27+
28+
// TestSetContainerStarting tests setContainerStarting sets removing
29+
// state correctly.
30+
func TestSetContainerStarting(t *testing.T) {
31+
testID := "test-id"
32+
for desc, test := range map[string]struct {
33+
status containerstore.Status
34+
expectErr bool
35+
}{
36+
37+
"should not return error when container is in created state": {
38+
status: containerstore.Status{
39+
CreatedAt: time.Now().UnixNano(),
40+
},
41+
expectErr: false,
42+
},
43+
"should return error when container is in running state": {
44+
status: containerstore.Status{
45+
CreatedAt: time.Now().UnixNano(),
46+
StartedAt: time.Now().UnixNano(),
47+
},
48+
expectErr: true,
49+
},
50+
"should return error when container is in exited state": {
51+
status: containerstore.Status{
52+
CreatedAt: time.Now().UnixNano(),
53+
StartedAt: time.Now().UnixNano(),
54+
FinishedAt: time.Now().UnixNano(),
55+
},
56+
expectErr: true,
57+
},
58+
"should return error when container is in unknown state": {
59+
status: containerstore.Status{
60+
CreatedAt: 0,
61+
StartedAt: 0,
62+
FinishedAt: 0,
63+
},
64+
expectErr: true,
65+
},
66+
"should return error when container is in starting state": {
67+
status: containerstore.Status{
68+
CreatedAt: time.Now().UnixNano(),
69+
Starting: true,
70+
},
71+
expectErr: true,
72+
},
73+
"should return error when container is in removing state": {
74+
status: containerstore.Status{
75+
CreatedAt: time.Now().UnixNano(),
76+
Removing: true,
77+
},
78+
expectErr: true,
79+
},
80+
} {
81+
t.Logf("TestCase %q", desc)
82+
container, err := containerstore.NewContainer(
83+
containerstore.Metadata{ID: testID},
84+
containerstore.WithFakeStatus(test.status),
85+
)
86+
assert.NoError(t, err)
87+
err = setContainerStarting(container)
88+
if test.expectErr {
89+
assert.Error(t, err)
90+
assert.Equal(t, test.status, container.Status.Get(), "metadata should not be updated")
91+
} else {
92+
assert.NoError(t, err)
93+
assert.True(t, container.Status.Get().Starting, "starting should be set")
94+
assert.NoError(t, resetContainerStarting(container))
95+
assert.False(t, container.Status.Get().Starting, "starting should be reset")
96+
}
97+
}
98+
}

pkg/server/container_stop.go

Lines changed: 25 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ package server
1919
import (
2020
"time"
2121

22-
"github.com/containerd/containerd"
2322
eventtypes "github.com/containerd/containerd/api/events"
2423
"github.com/containerd/containerd/errdefs"
2524
"github.com/docker/docker/pkg/signal"
@@ -29,6 +28,7 @@ import (
2928
"golang.org/x/sys/unix"
3029
runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/runtime/v1alpha2"
3130

31+
ctrdutil "github.com/containerd/cri/pkg/containerd/util"
3232
"github.com/containerd/cri/pkg/store"
3333
containerstore "github.com/containerd/cri/pkg/store/container"
3434
)
@@ -75,36 +75,34 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore
7575
return errors.Wrapf(err, "failed to get task for container %q", id)
7676
}
7777
// Don't return for unknown state, some cleanup needs to be done.
78-
if state != runtime.ContainerState_CONTAINER_UNKNOWN {
79-
return nil
78+
if state == runtime.ContainerState_CONTAINER_UNKNOWN {
79+
return cleanupUnknownContainer(ctx, id, container)
8080
}
81-
// Task is an interface, explicitly set it to nil just in case.
82-
task = nil
81+
return nil
8382
}
8483

8584
// Handle unknown state.
8685
if state == runtime.ContainerState_CONTAINER_UNKNOWN {
87-
status, err := getTaskStatus(ctx, task)
86+
// Start an exit handler for containers in unknown state.
87+
waitCtx, waitCancel := context.WithCancel(ctrdutil.NamespacedContext())
88+
defer waitCancel()
89+
exitCh, err := task.Wait(waitCtx)
8890
if err != nil {
89-
return errors.Wrapf(err, "failed to get task status for %q", id)
90-
}
91-
switch status.Status {
92-
case containerd.Running, containerd.Created:
93-
// The task is still running, continue stopping the task.
94-
case containerd.Stopped:
95-
// The task has exited. If the task exited after containerd
96-
// started, the event monitor will receive its exit event; if it
97-
// exited before containerd started, the event monitor will never
98-
// receive its exit event.
99-
// However, we can't tell that because the task state was not
100-
// successfully loaded during containerd start (container is
101-
// in UNKNOWN state).
102-
// So always do cleanup here, just in case that we've missed the
103-
// exit event.
104-
return cleanupUnknownContainer(ctx, id, status, container)
105-
default:
106-
return errors.Wrapf(err, "unsupported task status %q", status.Status)
91+
if !errdefs.IsNotFound(err) {
92+
return errors.Wrapf(err, "failed to wait for task for %q", id)
93+
}
94+
return cleanupUnknownContainer(ctx, id, container)
10795
}
96+
97+
exitCtx, exitCancel := context.WithCancel(context.Background())
98+
stopCh := c.eventMonitor.startExitMonitor(exitCtx, id, task.Pid(), exitCh)
99+
defer func() {
100+
exitCancel()
101+
// This ensures that exit monitor is stopped before
102+
// `Wait` is cancelled, so no exit event is generated
103+
// because of the `Wait` cancellation.
104+
<-stopCh
105+
}()
108106
}
109107

110108
// We only need to kill the task. The event handler will Delete the
@@ -177,19 +175,13 @@ func (c *criService) waitContainerStop(ctx context.Context, container containers
177175
}
178176

179177
// cleanupUnknownContainer cleanup stopped container in unknown state.
180-
func cleanupUnknownContainer(ctx context.Context, id string, status containerd.Status,
181-
cntr containerstore.Container) error {
178+
func cleanupUnknownContainer(ctx context.Context, id string, cntr containerstore.Container) error {
182179
// Reuse handleContainerExit to do the cleanup.
183-
// NOTE(random-liu): If the task did exit after containerd started, both
184-
// the event monitor and the cleanup function would update the container
185-
// state. The final container state will be whatever being updated first.
186-
// There is no way to completely avoid this race condition, and for best
187-
// effort unknown state container cleanup, this seems acceptable.
188180
return handleContainerExit(ctx, &eventtypes.TaskExit{
189181
ContainerID: id,
190182
ID: id,
191183
Pid: 0,
192-
ExitStatus: status.ExitStatus,
193-
ExitedAt: status.ExitTime,
184+
ExitStatus: unknownExitCode,
185+
ExitedAt: time.Now(),
194186
}, cntr)
195187
}

0 commit comments

Comments
 (0)