Skip to content

Commit 18725f0

Browse files
committed
integration: regression test for issue 10589
This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist. Relates to #10589 Signed-off-by: Samuel Karp <[email protected]>
1 parent 5f37a2c commit 18725f0

4 files changed

Lines changed: 365 additions & 2 deletions

File tree

integration/client/container_linux_test.go

Lines changed: 209 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,18 @@ import (
3434
"github.com/containerd/cgroups/v3/cgroup1"
3535
cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2"
3636
"github.com/containerd/containerd/api/types/runc/options"
37+
"github.com/containerd/errdefs"
38+
"github.com/stretchr/testify/assert"
39+
3740
. "github.com/containerd/containerd/v2/client"
3841
"github.com/containerd/containerd/v2/core/containers"
42+
"github.com/containerd/containerd/v2/integration/failpoint"
3943
"github.com/containerd/containerd/v2/pkg/cio"
44+
"github.com/containerd/containerd/v2/pkg/fifosync"
4045
"github.com/containerd/containerd/v2/pkg/oci"
4146
"github.com/containerd/containerd/v2/pkg/shim"
4247
"github.com/containerd/containerd/v2/pkg/sys"
4348
"github.com/containerd/containerd/v2/plugins"
44-
"github.com/containerd/errdefs"
4549

4650
"github.com/opencontainers/runtime-spec/specs-go"
4751
"github.com/stretchr/testify/require"
@@ -1551,3 +1555,207 @@ func TestIssue9103(t *testing.T) {
15511555
})
15521556
}
15531557
}
1558+
1559+
// TestIssue10589 is used as regression case for issue 10589.
1560+
//
1561+
// This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates
1562+
// by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This
1563+
// allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist.
1564+
//
1565+
// The workflow is as follows:
1566+
// 1. Create a container as normal
1567+
// 2. Make an exec1 using runc-fp with delayexec
1568+
// 3. Wait until the exec is waiting to start (triggered by delayexec)
1569+
// 4. Kill the container init process (signalling it is easiest)
1570+
// 5. Make an exec2 using runc-fp with delayexec
1571+
// 6. Wait until the exec is waiting to start
1572+
// 7. Allow exec1 to proceed
1573+
// 8. Allow exec2 to proceed
1574+
// 9. See that the container has exited and all execs have exited too
1575+
//
1576+
// https://github.com/containerd/containerd/issues/10589
1577+
func TestIssue10589(t *testing.T) {
1578+
if f := os.Getenv("RUNC_FLAVOR"); f != "" && f != "runc" {
1579+
t.Skip("test requires runc")
1580+
}
1581+
1582+
client, err := newClient(t, address)
1583+
require.NoError(t, err)
1584+
t.Cleanup(func() {
1585+
client.Close()
1586+
})
1587+
1588+
var (
1589+
image Image
1590+
ctx, cancel = testContext(t)
1591+
id = t.Name()
1592+
)
1593+
t.Cleanup(cancel)
1594+
1595+
image, err = client.GetImage(ctx, testImage)
1596+
require.NoError(t, err)
1597+
1598+
// 1. Create a sleeping container
1599+
t.Log("1. Create a sleeping container")
1600+
container, err := client.NewContainer(ctx, id,
1601+
WithNewSnapshot(id, image),
1602+
WithNewSpec(oci.WithImageConfig(image),
1603+
withProcessArgs("sleep", "inf"),
1604+
oci.WithAnnotations(map[string]string{
1605+
"oci.runc.failpoint.profile": "delayExec",
1606+
}),
1607+
),
1608+
WithRuntime(client.Runtime(), &options.Options{
1609+
BinaryName: "runc-fp",
1610+
}),
1611+
)
1612+
require.NoError(t, err, "create container")
1613+
t.Cleanup(func() {
1614+
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
1615+
err := container.Delete(ctx, WithSnapshotCleanup)
1616+
if err != nil {
1617+
t.Log("delete err", err)
1618+
}
1619+
cancel()
1620+
})
1621+
1622+
task, err := container.NewTask(ctx, empty())
1623+
require.NoError(t, err, "create task")
1624+
t.Cleanup(func() {
1625+
ctx, cancel := context.WithTimeout(ctx, 2*time.Second)
1626+
st, err := task.Delete(ctx, WithProcessKill)
1627+
t.Log("exit status", st)
1628+
if err != nil {
1629+
t.Log("kill err", err)
1630+
}
1631+
cancel()
1632+
})
1633+
1634+
err = task.Start(ctx)
1635+
require.NoError(t, err, "start container")
1636+
1637+
status, err := task.Status(ctx)
1638+
require.NoError(t, err, "container status")
1639+
require.Equal(t, Running, status.Status)
1640+
1641+
// 2. Create an exec
1642+
t.Log("2. Create exec1")
1643+
exec1ReadyFifo, err := fifosync.NewWaiter(filepath.Join(t.TempDir(), "exec1-ready.fifo"), 0600)
1644+
require.NoError(t, err, "create exec1 ready fifo")
1645+
exec1DelayFifo, err := fifosync.NewTrigger(filepath.Join(t.TempDir(), "exec1-delay.fifo"), 0600)
1646+
require.NoError(t, err, "create exec1 delay fifo")
1647+
exec1, err := task.Exec(ctx, "exec1", &specs.Process{
1648+
Args: []string{"/bin/sleep", "301"},
1649+
Cwd: "/",
1650+
Env: []string{
1651+
failpoint.DelayExecReadyEnv + "=" + exec1ReadyFifo.Name(),
1652+
failpoint.DelayExecDelayEnv + "=" + exec1DelayFifo.Name(),
1653+
},
1654+
}, cio.NullIO)
1655+
require.NoError(t, err, "create exec1")
1656+
1657+
exec1done := make(chan struct{})
1658+
go func() {
1659+
defer close(exec1done)
1660+
t.Log("Starting exec1")
1661+
err := exec1.Start(ctx)
1662+
assert.Error(t, err, "start exec1")
1663+
t.Logf("error starting exec1: %s", err)
1664+
}()
1665+
1666+
// 3. Wait until the exec is waiting to start
1667+
t.Log("3. Wait until exec1 is waiting to start")
1668+
err = exec1ReadyFifo.Wait()
1669+
require.NoError(t, err, "open exec1 fifo")
1670+
1671+
// 4. Kill the container init process
1672+
t.Log("4. Kill the container init process")
1673+
target := task.Pid()
1674+
t.Logf("Killing main pid (%v) of container %s", target, container.ID())
1675+
syscall.Kill(int(target), syscall.SIGKILL)
1676+
status, err = task.Status(ctx)
1677+
require.NoError(t, err, "container status")
1678+
t.Log("container status", status.Status)
1679+
1680+
// 5. Make an exec (2) using this failpoint
1681+
t.Log("5. Create exec2")
1682+
exec2ReadyFifo, err := fifosync.NewWaiter(filepath.Join(t.TempDir(), "exec2-ready.fifo"), 0600)
1683+
require.NoError(t, err, "create exec2 ready fifo: %q", exec2ReadyFifo)
1684+
exec2DelayFifo, err := fifosync.NewTrigger(filepath.Join(t.TempDir(), "exec2-delay.fifo"), 0600)
1685+
require.NoError(t, err, "create exec2 delay fifo: %q", exec2DelayFifo)
1686+
exec2, err := task.Exec(ctx, "exec2", &specs.Process{
1687+
Args: []string{"/bin/sleep", "302"},
1688+
Cwd: "/",
1689+
Env: []string{
1690+
failpoint.DelayExecReadyEnv + "=" + exec2ReadyFifo.Name(),
1691+
failpoint.DelayExecDelayEnv + "=" + exec2DelayFifo.Name(),
1692+
},
1693+
}, cio.NullIO)
1694+
require.NoError(t, err, "create exec2")
1695+
1696+
exec2done := make(chan struct{})
1697+
didExec2Run := true
1698+
go func() {
1699+
defer close(exec2done)
1700+
t.Log("Starting exec2")
1701+
err := exec2.Start(ctx)
1702+
assert.Error(t, err, "start exec2")
1703+
t.Logf("error starting exec2: %s", err)
1704+
}()
1705+
1706+
// 6. Wait until the exec is waiting to start
1707+
t.Log("6. Wait until exec2 is waiting to start")
1708+
exec2ready := make(chan struct{})
1709+
go func() {
1710+
exec2ReadyFifo.Wait()
1711+
close(exec2ready)
1712+
}()
1713+
select {
1714+
case <-exec2ready:
1715+
case <-exec2done:
1716+
didExec2Run = false
1717+
}
1718+
1719+
// 7. Allow exec=1 to proceed
1720+
t.Log("7. Allow exec=1 to proceed")
1721+
err = exec1DelayFifo.Trigger()
1722+
assert.NoError(t, err, "trigger exec1 fifo")
1723+
status, err = task.Status(ctx)
1724+
require.NoError(t, err, "container status")
1725+
t.Log("container status", status.Status)
1726+
<-exec1done
1727+
status, err = task.Status(ctx)
1728+
require.NoError(t, err, "container status")
1729+
t.Log("container status", status.Status)
1730+
1731+
// 8. Allow exec=2 to proceed
1732+
if didExec2Run {
1733+
t.Log("8. Allow exec2 to proceed")
1734+
err = exec2DelayFifo.Trigger()
1735+
assert.NoError(t, err, "trigger exec2 fifo")
1736+
status, err = task.Status(ctx)
1737+
require.NoError(t, err, "container status")
1738+
t.Log("container status", status.Status)
1739+
<-exec2done
1740+
status, err = task.Status(ctx)
1741+
require.NoError(t, err, "container status")
1742+
t.Log("container status", status.Status)
1743+
} else {
1744+
t.Log("8. Skip exec2")
1745+
}
1746+
1747+
// 9. Validate
1748+
t.Log("9. Validate")
1749+
status, err = exec1.Status(ctx)
1750+
require.NoError(t, err, "exec1 status")
1751+
t.Logf("exec1 status: %s", status.Status)
1752+
assert.Equal(t, Created, status.Status)
1753+
status, err = exec2.Status(ctx)
1754+
require.NoError(t, err, "exec2 status")
1755+
t.Logf("exec2 status: %s", status.Status)
1756+
assert.Equal(t, Created, status.Status)
1757+
status, err = task.Status(ctx)
1758+
t.Logf("task status: %s", status.Status)
1759+
require.NoError(t, err, "container status")
1760+
assert.Equal(t, Stopped, status.Status)
1761+
}
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
//go:build linux
2+
3+
/*
4+
Copyright The containerd Authors.
5+
6+
Licensed under the Apache License, Version 2.0 (the "License");
7+
you may not use this file except in compliance with the License.
8+
You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing, software
13+
distributed under the License is distributed on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
See the License for the specific language governing permissions and
16+
limitations under the License.
17+
*/
18+
19+
package main
20+
21+
import (
22+
"context"
23+
"encoding/json"
24+
"errors"
25+
"fmt"
26+
"io"
27+
"os"
28+
"strings"
29+
30+
"github.com/opencontainers/runtime-spec/specs-go"
31+
"github.com/sirupsen/logrus"
32+
33+
"github.com/containerd/containerd/v2/integration/failpoint"
34+
"github.com/containerd/containerd/v2/pkg/fifosync"
35+
)
36+
37+
// delayExec delays an "exec" command until a trigger is received from the calling test program. This can be used to
38+
// test races around container lifecycle and exec processes.
39+
func delayExec(ctx context.Context, method invoker) error {
40+
isExec := strings.Contains(strings.Join(os.Args, ","), ",exec,")
41+
if !isExec {
42+
if err := method(ctx); err != nil {
43+
return err
44+
}
45+
return nil
46+
}
47+
logrus.Debug("EXEC!")
48+
49+
if err := delay(); err != nil {
50+
return err
51+
}
52+
if err := method(ctx); err != nil {
53+
return err
54+
}
55+
return nil
56+
}
57+
58+
func delay() error {
59+
ready, delay, err := fifoFromProcessEnv()
60+
if err != nil {
61+
return err
62+
}
63+
if err := ready.Trigger(); err != nil {
64+
return err
65+
}
66+
return delay.Wait()
67+
}
68+
69+
// fifoFromProcessEnv finds a fifo specified in the environment variables of an exec process
70+
func fifoFromProcessEnv() (fifosync.Trigger, fifosync.Waiter, error) {
71+
env, err := processEnvironment()
72+
if err != nil {
73+
return nil, nil, err
74+
}
75+
76+
readyName, ok := env[failpoint.DelayExecReadyEnv]
77+
if !ok {
78+
return nil, nil, fmt.Errorf("fifo: failed to find %q env var in %v", failpoint.DelayExecReadyEnv, env)
79+
}
80+
delayName, ok := env[failpoint.DelayExecDelayEnv]
81+
if !ok {
82+
return nil, nil, fmt.Errorf("fifo: failed to find %q env var in %v", failpoint.DelayExecDelayEnv, env)
83+
}
84+
logrus.WithField("ready", readyName).WithField("delay", delayName).Debug("Found FIFOs!")
85+
readyFIFO, err := fifosync.NewTrigger(readyName, 0600)
86+
if err != nil {
87+
return nil, nil, err
88+
}
89+
delayFIFO, err := fifosync.NewWaiter(delayName, 0600)
90+
if err != nil {
91+
return nil, nil, err
92+
}
93+
return readyFIFO, delayFIFO, nil
94+
}
95+
96+
func processEnvironment() (map[string]string, error) {
97+
idx := 2
98+
for ; idx < len(os.Args); idx++ {
99+
if os.Args[idx] == "--process" {
100+
break
101+
}
102+
}
103+
104+
if idx >= len(os.Args)-1 || os.Args[idx] != "--process" {
105+
return nil, errors.New("env: option --process required")
106+
}
107+
108+
specFile := os.Args[idx+1]
109+
f, err := os.OpenFile(specFile, os.O_RDONLY, 0o644)
110+
if err != nil {
111+
return nil, fmt.Errorf("env: failed to open %s: %w", specFile, err)
112+
}
113+
114+
b, err := io.ReadAll(f)
115+
if err != nil {
116+
return nil, fmt.Errorf("env: failed to read spec from %q", specFile)
117+
}
118+
var spec specs.Process
119+
if err := json.Unmarshal(b, &spec); err != nil {
120+
return nil, fmt.Errorf("env: failed to unmarshal spec from %q: %w", specFile, err)
121+
}
122+
123+
// XXX: env vars can be specified multiple times, but we only keep one
124+
env := make(map[string]string)
125+
for _, e := range spec.Env {
126+
k, v, _ := strings.Cut(e, "=")
127+
env[k] = v
128+
}
129+
130+
return env, nil
131+
}

integration/failpoint/cmd/runc-fp/main.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,9 @@ import (
2525
"os/exec"
2626
"syscall"
2727

28-
"github.com/containerd/containerd/v2/pkg/oci"
2928
"github.com/sirupsen/logrus"
29+
30+
"github.com/containerd/containerd/v2/pkg/oci"
3031
)
3132

3233
const (
@@ -40,6 +41,7 @@ type invokerInterceptor func(context.Context, invoker) error
4041
var (
4142
failpointProfiles = map[string]invokerInterceptor{
4243
"issue9103": issue9103KillInitAfterCreate,
44+
"delayExec": delayExec,
4345
}
4446
)
4547

0 commit comments

Comments
 (0)