Skip to content

Commit f2376e6

Browse files
committed
Update container with sandbox metadata after NetNS is created
Signed-off-by: Qiutong Song <[email protected]> (cherry picked from commit b41d6f4) Signed-off-by: Qiutong Song <[email protected]>
1 parent a91dd67 commit f2376e6

6 files changed

Lines changed: 232 additions & 28 deletions

File tree

integration/main_test.go

Lines changed: 86 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,24 @@
1717
package integration
1818

1919
import (
20+
"bytes"
2021
"context"
2122
"encoding/json"
2223
"errors"
2324
"flag"
2425
"fmt"
26+
"io"
2527
"os"
2628
"path/filepath"
2729
goruntime "runtime"
2830
"strconv"
2931
"strings"
32+
"syscall"
3033
"testing"
3134
"time"
3235

3336
"github.com/containerd/containerd"
37+
"github.com/containerd/containerd/containers"
3438
cri "github.com/containerd/containerd/integration/cri-api/pkg/apis"
3539
"github.com/containerd/containerd/integration/remote"
3640
dialer "github.com/containerd/containerd/integration/util"
@@ -384,12 +388,12 @@ func Randomize(str string) string {
384388
}
385389

386390
// KillProcess kills the process by name. pkill is used.
387-
func KillProcess(name string) error {
391+
func KillProcess(name string, signal syscall.Signal) error {
388392
var command []string
389393
if goruntime.GOOS == "windows" {
390394
command = []string{"taskkill", "/IM", name, "/F"}
391395
} else {
392-
command = []string{"pkill", "-x", fmt.Sprintf("^%s$", name)}
396+
command = []string{"pkill", "-" + strconv.Itoa(int(signal)), "-x", fmt.Sprintf("^%s$", name)}
393397
}
394398

395399
output, err := exec.Command(command[0], command[1:]...).CombinedOutput()
@@ -421,6 +425,80 @@ func PidOf(name string) (int, error) {
421425
return strconv.Atoi(output)
422426
}
423427

428+
// PidsOf returns pid(s) of a process by name
429+
func PidsOf(name string) ([]int, error) {
430+
if len(name) == 0 {
431+
return []int{}, fmt.Errorf("name is required")
432+
}
433+
434+
procDirFD, err := os.Open("/proc")
435+
if err != nil {
436+
return nil, fmt.Errorf("failed to open /proc: %w", err)
437+
}
438+
defer procDirFD.Close()
439+
440+
res := []int{}
441+
for {
442+
fileInfos, err := procDirFD.Readdir(100)
443+
if err != nil {
444+
if err == io.EOF {
445+
break
446+
}
447+
return nil, fmt.Errorf("failed to readdir: %w", err)
448+
}
449+
450+
for _, fileInfo := range fileInfos {
451+
if !fileInfo.IsDir() {
452+
continue
453+
}
454+
455+
pid, err := strconv.Atoi(fileInfo.Name())
456+
if err != nil {
457+
continue
458+
}
459+
460+
exePath, err := os.Readlink(filepath.Join("/proc", fileInfo.Name(), "exe"))
461+
if err != nil {
462+
continue
463+
}
464+
465+
if strings.HasSuffix(exePath, name) {
466+
res = append(res, pid)
467+
}
468+
}
469+
}
470+
return res, nil
471+
}
472+
473+
// PidEnvs returns the environ of pid in key-value pairs.
474+
func PidEnvs(pid int) (map[string]string, error) {
475+
envPath := filepath.Join("/proc", strconv.Itoa(pid), "environ")
476+
477+
b, err := os.ReadFile(envPath)
478+
if err != nil {
479+
return nil, fmt.Errorf("failed to read %s: %w", envPath, err)
480+
}
481+
482+
values := bytes.Split(b, []byte{0})
483+
if len(values) == 0 {
484+
return nil, nil
485+
}
486+
487+
res := make(map[string]string)
488+
for _, value := range values {
489+
value := strings.TrimSpace(string(value))
490+
if len(value) == 0 {
491+
continue
492+
}
493+
494+
parts := strings.SplitN(value, "=", 2)
495+
if len(parts) == 2 {
496+
res[parts[0]] = parts[1]
497+
}
498+
}
499+
return res, nil
500+
}
501+
424502
// RawRuntimeClient returns a raw grpc runtime service client.
425503
func RawRuntimeClient() (runtime.RuntimeServiceClient, error) {
426504
addr, dialer, err := dialer.GetAddressAndDialer(*criEndpoint)
@@ -477,8 +555,8 @@ func SandboxInfo(id string) (*runtime.PodSandboxStatus, *server.SandboxInfo, err
477555
return status, &info, nil
478556
}
479557

480-
func RestartContainerd(t *testing.T) {
481-
require.NoError(t, KillProcess(*containerdBin))
558+
func RestartContainerd(t *testing.T, signal syscall.Signal) {
559+
require.NoError(t, KillProcess(*containerdBin, signal))
482560

483561
// Use assert so that the 3rd wait always runs, this makes sure
484562
// containerd is running before this function returns.
@@ -494,3 +572,7 @@ func RestartContainerd(t *testing.T) {
494572
return ConnectDaemons() == nil, nil
495573
}, time.Second, 30*time.Second), "wait for containerd to be restarted")
496574
}
575+
576+
func GetContainer(id string) (containers.Container, error) {
577+
return containerdClient.ContainerService().Get(context.Background(), id)
578+
}

integration/restart_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ func TestContainerdRestart(t *testing.T) {
177177
assert.NoError(t, err)
178178

179179
t.Logf("Restart containerd")
180-
RestartContainerd(t)
180+
RestartContainerd(t, syscall.SIGTERM)
181181

182182
t.Logf("Check sandbox and container state after restart")
183183
loadedSandboxes, err := runtimeService.ListPodSandbox(&runtime.PodSandboxFilter{})

integration/sandbox_run_rollback_test.go

Lines changed: 136 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,24 @@ import (
2626
"path/filepath"
2727
"runtime"
2828
"strings"
29+
"sync"
30+
"syscall"
2931
"testing"
32+
"time"
3033

34+
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
35+
"github.com/stretchr/testify/assert"
3136
"github.com/stretchr/testify/require"
3237
criapiv1 "k8s.io/cri-api/pkg/apis/runtime/v1"
3338

39+
"github.com/containerd/containerd/pkg/cri/store/sandbox"
3440
"github.com/containerd/containerd/pkg/failpoint"
41+
"github.com/containerd/typeurl"
3542
)
3643

3744
const (
3845
failpointRuntimeHandler = "runc-fp"
46+
failpointCNIBinary = "cni-bridge-fp"
3947

4048
failpointShimPrefixKey = "io.containerd.runtime.v2.shim.failpoint."
4149

@@ -138,7 +146,7 @@ func TestRunPodSandboxWithShimDeleteFailure(t *testing.T) {
138146

139147
if restart {
140148
t.Log("Restart containerd")
141-
RestartContainerd(t)
149+
RestartContainerd(t, syscall.SIGTERM)
142150

143151
t.Log("ListPodSandbox with the specific label")
144152
l, err = runtimeService.ListPodSandbox(&criapiv1.PodSandboxFilter{Id: sb.Id})
@@ -211,7 +219,7 @@ func TestRunPodSandboxWithShimStartAndTeardownCNIFailure(t *testing.T) {
211219

212220
if restart {
213221
t.Log("Restart containerd")
214-
RestartContainerd(t)
222+
RestartContainerd(t, syscall.SIGTERM)
215223

216224
t.Log("ListPodSandbox with the specific label")
217225
l, err = runtimeService.ListPodSandbox(&criapiv1.PodSandboxFilter{Id: sb.Id})
@@ -229,6 +237,132 @@ func TestRunPodSandboxWithShimStartAndTeardownCNIFailure(t *testing.T) {
229237
t.Run("JustCleanup", testCase(false))
230238
}
231239

240+
// TestRunPodSandboxWithShimStartAndTeardownCNISlow should keep the sandbox
241+
// record if failed to rollback CNI API.
242+
func TestRunPodSandboxAndTeardownCNISlow(t *testing.T) {
243+
if runtime.GOOS != "linux" {
244+
t.Skip()
245+
}
246+
if os.Getenv("ENABLE_CRI_SANDBOXES") != "" {
247+
t.Skip()
248+
}
249+
250+
t.Log("Init PodSandboxConfig with specific key")
251+
sbName := t.Name()
252+
labels := map[string]string{
253+
sbName: "true",
254+
}
255+
sbConfig := PodSandboxConfig(sbName, "failpoint", WithPodLabels(labels))
256+
257+
t.Log("Inject CNI failpoint")
258+
conf := &failpointConf{
259+
// Delay 1 day
260+
Add: "1*delay(86400000)",
261+
}
262+
injectCNIFailpoint(t, sbConfig, conf)
263+
264+
var wg sync.WaitGroup
265+
wg.Add(1)
266+
267+
go func() {
268+
defer wg.Done()
269+
t.Log("Create a sandbox")
270+
_, err := runtimeService.RunPodSandbox(sbConfig, failpointRuntimeHandler)
271+
require.Error(t, err)
272+
require.Contains(t, err.Error(), "error reading from server: EOF")
273+
}()
274+
275+
assert.NoError(t, ensureCNIAddRunning(t, sbName), "check that failpoint CNI.Add is running")
276+
277+
// Use SIGKILL to prevent containerd server gracefulshutdown which may cause indeterministic invocation of defer functions
278+
t.Log("Restart containerd")
279+
RestartContainerd(t, syscall.SIGKILL)
280+
281+
wg.Wait()
282+
283+
t.Log("ListPodSandbox with the specific label")
284+
l, err := runtimeService.ListPodSandbox(&criapiv1.PodSandboxFilter{LabelSelector: labels})
285+
require.NoError(t, err)
286+
require.Len(t, l, 1)
287+
288+
sb := l[0]
289+
290+
defer func() {
291+
t.Log("Cleanup leaky sandbox")
292+
err := runtimeService.StopPodSandbox(sb.Id)
293+
assert.NoError(t, err)
294+
err = runtimeService.RemovePodSandbox(sb.Id)
295+
require.NoError(t, err)
296+
}()
297+
298+
assert.Equal(t, sb.State, criapiv1.PodSandboxState_SANDBOX_NOTREADY)
299+
assert.Equal(t, sb.Metadata.Name, sbConfig.Metadata.Name)
300+
assert.Equal(t, sb.Metadata.Namespace, sbConfig.Metadata.Namespace)
301+
assert.Equal(t, sb.Metadata.Uid, sbConfig.Metadata.Uid)
302+
assert.Equal(t, sb.Metadata.Attempt, sbConfig.Metadata.Attempt)
303+
304+
t.Log("Get sandbox info")
305+
_, info, err := SandboxInfo(sb.Id)
306+
require.NoError(t, err)
307+
require.False(t, info.NetNSClosed)
308+
309+
var netNS string
310+
for _, n := range info.RuntimeSpec.Linux.Namespaces {
311+
if n.Type == runtimespec.NetworkNamespace {
312+
netNS = n.Path
313+
}
314+
}
315+
assert.NotEmpty(t, netNS, "network namespace should be set")
316+
317+
t.Log("Get sandbox container")
318+
c, err := GetContainer(sb.Id)
319+
require.NoError(t, err)
320+
any, ok := c.Extensions["io.cri-containerd.sandbox.metadata"]
321+
require.True(t, ok, "sandbox metadata should exist in extension")
322+
i, err := typeurl.UnmarshalAny(&any)
323+
require.NoError(t, err)
324+
require.IsType(t, &sandbox.Metadata{}, i)
325+
metadata, ok := i.(*sandbox.Metadata)
326+
require.True(t, ok)
327+
assert.NotEmpty(t, metadata.NetNSPath)
328+
assert.Equal(t, netNS, metadata.NetNSPath, "network namespace path should be the same in runtime spec and sandbox metadata")
329+
}
330+
331+
func ensureCNIAddRunning(t *testing.T, sbName string) error {
332+
return Eventually(func() (bool, error) {
333+
pids, err := PidsOf(failpointCNIBinary)
334+
if err != nil || len(pids) == 0 {
335+
return false, err
336+
}
337+
338+
for _, pid := range pids {
339+
envs, err := PidEnvs(pid)
340+
if err != nil {
341+
t.Logf("failed to read environ of pid %v: %v: skip it", pid, err)
342+
continue
343+
}
344+
345+
args, ok := envs["CNI_ARGS"]
346+
if !ok {
347+
t.Logf("expected CNI_ARGS env but got nothing, skip pid=%v", pid)
348+
continue
349+
}
350+
351+
for _, arg := range strings.Split(args, ";") {
352+
kv := strings.SplitN(arg, "=", 2)
353+
if len(kv) != 2 {
354+
continue
355+
}
356+
357+
if kv[0] == "K8S_POD_NAME" && kv[1] == sbName {
358+
return true, nil
359+
}
360+
}
361+
}
362+
return false, nil
363+
}, time.Second, 30*time.Second)
364+
}
365+
232366
// failpointConf is used to describe cmdAdd/cmdDel/cmdCheck command's failpoint.
233367
type failpointConf struct {
234368
Add string `json:"cmdAdd"`

pkg/cri/server/container_update_resources_other.go

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,6 @@ import (
2323
"context"
2424
"fmt"
2525

26-
"github.com/containerd/containerd"
27-
"github.com/containerd/containerd/containers"
28-
"github.com/containerd/typeurl"
29-
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
3026
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
3127

3228
containerstore "github.com/containerd/containerd/pkg/cri/store/container"
@@ -48,19 +44,3 @@ func (c *criService) UpdateContainerResources(ctx context.Context, r *runtime.Up
4844
}
4945
return &runtime.UpdateContainerResourcesResponse{}, nil
5046
}
51-
52-
// TODO: Copied from container_update_resources.go because that file is not built for darwin.
53-
// updateContainerSpec updates container spec.
54-
func updateContainerSpec(ctx context.Context, cntr containerd.Container, spec *runtimespec.Spec) error {
55-
any, err := typeurl.MarshalAny(spec)
56-
if err != nil {
57-
return fmt.Errorf("failed to marshal spec %+v: %w", spec, err)
58-
}
59-
if err := cntr.Update(ctx, func(ctx context.Context, client *containerd.Client, c *containers.Container) error {
60-
c.Spec = any
61-
return nil
62-
}); err != nil {
63-
return fmt.Errorf("failed to update container spec: %w", err)
64-
}
65-
return nil
66-
}

pkg/cri/server/sandbox_run.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,12 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
290290

291291
// Update network namespace in the container's spec
292292
c.updateNetNamespacePath(spec, sandbox.NetNSPath)
293-
if err := updateContainerSpec(ctx, container, spec); err != nil {
293+
294+
if err := container.Update(ctx,
295+
// Update spec of the container
296+
containerd.UpdateContainerOpts(containerd.WithSpec(spec)),
297+
// Update sandbox metadata to include NetNS info
298+
containerd.UpdateContainerOpts(containerd.WithContainerExtension(sandboxMetadataExtension, &sandbox.Metadata))); err != nil {
294299
return nil, fmt.Errorf("failed to update the network namespace for the sandbox container %q: %w", id, err)
295300
}
296301

script/test/utils.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,9 @@ run_crictl() {
271271
# keepalive runs a command and keeps it alive.
272272
# keepalive process is eventually killed in test_teardown.
273273
keepalive() {
274+
# The command may return non-zero and we want to continue this script.
275+
# e.g. containerd receives SIGKILL
276+
set +e
274277
local command=$1
275278
echo "${command}"
276279
local wait_period=$2

0 commit comments

Comments
 (0)