Skip to content

Commit ab9fedd

Browse files
committed
integration: regression test for issue 10589
This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist. Relates to #10589 Signed-off-by: Samuel Karp <[email protected]> (cherry picked from commit 18725f0) Signed-off-by: Samuel Karp <[email protected]>
1 parent d0989e9 commit ab9fedd

4 files changed

Lines changed: 365 additions & 0 deletions

File tree

integration/client/container_linux_test.go

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,17 @@ import (
3535
. "github.com/containerd/containerd"
3636
"github.com/containerd/containerd/cio"
3737
"github.com/containerd/containerd/containers"
38+
"github.com/containerd/containerd/integration/failpoint"
3839
"github.com/containerd/containerd/oci"
40+
"github.com/containerd/containerd/pkg/fifosync"
3941
"github.com/containerd/containerd/plugin"
4042
"github.com/containerd/containerd/runtime/linux/runctypes"
4143
"github.com/containerd/containerd/runtime/v2/runc/options"
4244
"github.com/containerd/containerd/sys"
4345
"github.com/containerd/errdefs"
4446

4547
"github.com/opencontainers/runtime-spec/specs-go"
48+
"github.com/stretchr/testify/assert"
4649
"github.com/stretchr/testify/require"
4750
exec "golang.org/x/sys/execabs"
4851
"golang.org/x/sys/unix"
@@ -1591,3 +1594,210 @@ func TestIssue9103(t *testing.T) {
15911594
})
15921595
}
15931596
}
1597+
1598+
// TestIssue10589 is used as regression case for issue 10589.
1599+
//
1600+
// This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates
1601+
// by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This
1602+
// allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist.
1603+
//
1604+
// The workflow is as follows:
1605+
// 1. Create a container as normal
1606+
// 2. Make an exec1 using runc-fp with delayexec
1607+
// 3. Wait until the exec is waiting to start (triggered by delayexec)
1608+
// 4. Kill the container init process (signalling it is easiest)
1609+
// 5. Make an exec2 using runc-fp with delayexec
1610+
// 6. Wait until the exec is waiting to start
1611+
// 7. Allow exec1 to proceed
1612+
// 8. Allow exec2 to proceed
1613+
// 9. See that the container has exited and all execs have exited too
1614+
//
1615+
// https://github.com/containerd/containerd/issues/10589
1616+
func TestIssue10589(t *testing.T) {
1617+
if f := os.Getenv("RUNC_FLAVOR"); f != "" && f != "runc" {
1618+
t.Skip("test requires runc")
1619+
}
1620+
if rt := os.Getenv("TEST_RUNTIME"); rt != "" && rt != plugin.RuntimeRuncV2 {
1621+
t.Skip("test requires io.containerd.runc.v2")
1622+
}
1623+
1624+
client, err := newClient(t, address)
1625+
require.NoError(t, err)
1626+
t.Cleanup(func() {
1627+
client.Close()
1628+
})
1629+
1630+
var (
1631+
image Image
1632+
ctx, cancel = testContext(t)
1633+
id = t.Name()
1634+
)
1635+
t.Cleanup(cancel)
1636+
1637+
image, err = client.GetImage(ctx, testImage)
1638+
require.NoError(t, err)
1639+
1640+
// 1. Create a sleeping container
1641+
t.Log("1. Create a sleeping container")
1642+
container, err := client.NewContainer(ctx, id,
1643+
WithNewSnapshot(id, image),
1644+
WithNewSpec(oci.WithImageConfig(image),
1645+
withProcessArgs("sleep", "inf"),
1646+
oci.WithAnnotations(map[string]string{
1647+
"oci.runc.failpoint.profile": "delayExec",
1648+
}),
1649+
),
1650+
WithRuntime(client.Runtime(), &options.Options{
1651+
BinaryName: "runc-fp",
1652+
}),
1653+
)
1654+
require.NoError(t, err, "create container")
1655+
t.Cleanup(func() {
1656+
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
1657+
err := container.Delete(ctx, WithSnapshotCleanup)
1658+
if err != nil {
1659+
t.Log("delete err", err)
1660+
}
1661+
cancel()
1662+
})
1663+
1664+
task, err := container.NewTask(ctx, empty())
1665+
require.NoError(t, err, "create task")
1666+
t.Cleanup(func() {
1667+
ctx, cancel := context.WithTimeout(ctx, 2*time.Second)
1668+
st, err := task.Delete(ctx, WithProcessKill)
1669+
t.Log("exit status", st)
1670+
if err != nil {
1671+
t.Log("kill err", err)
1672+
}
1673+
cancel()
1674+
})
1675+
1676+
err = task.Start(ctx)
1677+
require.NoError(t, err, "start container")
1678+
1679+
status, err := task.Status(ctx)
1680+
require.NoError(t, err, "container status")
1681+
require.Equal(t, Running, status.Status)
1682+
1683+
// 2. Create an exec
1684+
t.Log("2. Create exec1")
1685+
exec1ReadyFifo, err := fifosync.NewWaiter(filepath.Join(t.TempDir(), "exec1-ready.fifo"), 0600)
1686+
require.NoError(t, err, "create exec1 ready fifo")
1687+
exec1DelayFifo, err := fifosync.NewTrigger(filepath.Join(t.TempDir(), "exec1-delay.fifo"), 0600)
1688+
require.NoError(t, err, "create exec1 delay fifo")
1689+
exec1, err := task.Exec(ctx, "exec1", &specs.Process{
1690+
Args: []string{"/bin/sleep", "301"},
1691+
Cwd: "/",
1692+
Env: []string{
1693+
failpoint.DelayExecReadyEnv + "=" + exec1ReadyFifo.Name(),
1694+
failpoint.DelayExecDelayEnv + "=" + exec1DelayFifo.Name(),
1695+
},
1696+
}, cio.NullIO)
1697+
require.NoError(t, err, "create exec1")
1698+
1699+
exec1done := make(chan struct{})
1700+
go func() {
1701+
defer close(exec1done)
1702+
t.Log("Starting exec1")
1703+
err := exec1.Start(ctx)
1704+
assert.Error(t, err, "start exec1")
1705+
t.Logf("error starting exec1: %s", err)
1706+
}()
1707+
1708+
// 3. Wait until the exec is waiting to start
1709+
t.Log("3. Wait until exec1 is waiting to start")
1710+
err = exec1ReadyFifo.Wait()
1711+
require.NoError(t, err, "open exec1 fifo")
1712+
1713+
// 4. Kill the container init process
1714+
t.Log("4. Kill the container init process")
1715+
target := task.Pid()
1716+
t.Logf("Killing main pid (%v) of container %s", target, container.ID())
1717+
syscall.Kill(int(target), syscall.SIGKILL)
1718+
status, err = task.Status(ctx)
1719+
require.NoError(t, err, "container status")
1720+
t.Log("container status", status.Status)
1721+
1722+
// 5. Make an exec (2) using this failpoint
1723+
t.Log("5. Create exec2")
1724+
exec2ReadyFifo, err := fifosync.NewWaiter(filepath.Join(t.TempDir(), "exec2-ready.fifo"), 0600)
1725+
require.NoError(t, err, "create exec2 ready fifo: %q", exec2ReadyFifo)
1726+
exec2DelayFifo, err := fifosync.NewTrigger(filepath.Join(t.TempDir(), "exec2-delay.fifo"), 0600)
1727+
require.NoError(t, err, "create exec2 delay fifo: %q", exec2DelayFifo)
1728+
exec2, err := task.Exec(ctx, "exec2", &specs.Process{
1729+
Args: []string{"/bin/sleep", "302"},
1730+
Cwd: "/",
1731+
Env: []string{
1732+
failpoint.DelayExecReadyEnv + "=" + exec2ReadyFifo.Name(),
1733+
failpoint.DelayExecDelayEnv + "=" + exec2DelayFifo.Name(),
1734+
},
1735+
}, cio.NullIO)
1736+
require.NoError(t, err, "create exec2")
1737+
1738+
exec2done := make(chan struct{})
1739+
didExec2Run := true
1740+
go func() {
1741+
defer close(exec2done)
1742+
t.Log("Starting exec2")
1743+
err := exec2.Start(ctx)
1744+
assert.Error(t, err, "start exec2")
1745+
t.Logf("error starting exec2: %s", err)
1746+
}()
1747+
1748+
// 6. Wait until the exec is waiting to start
1749+
t.Log("6. Wait until exec2 is waiting to start")
1750+
exec2ready := make(chan struct{})
1751+
go func() {
1752+
exec2ReadyFifo.Wait()
1753+
close(exec2ready)
1754+
}()
1755+
select {
1756+
case <-exec2ready:
1757+
case <-exec2done:
1758+
didExec2Run = false
1759+
}
1760+
1761+
// 7. Allow exec=1 to proceed
1762+
t.Log("7. Allow exec=1 to proceed")
1763+
err = exec1DelayFifo.Trigger()
1764+
assert.NoError(t, err, "trigger exec1 fifo")
1765+
status, err = task.Status(ctx)
1766+
require.NoError(t, err, "container status")
1767+
t.Log("container status", status.Status)
1768+
<-exec1done
1769+
status, err = task.Status(ctx)
1770+
require.NoError(t, err, "container status")
1771+
t.Log("container status", status.Status)
1772+
1773+
// 8. Allow exec=2 to proceed
1774+
if didExec2Run {
1775+
t.Log("8. Allow exec2 to proceed")
1776+
err = exec2DelayFifo.Trigger()
1777+
assert.NoError(t, err, "trigger exec2 fifo")
1778+
status, err = task.Status(ctx)
1779+
require.NoError(t, err, "container status")
1780+
t.Log("container status", status.Status)
1781+
<-exec2done
1782+
status, err = task.Status(ctx)
1783+
require.NoError(t, err, "container status")
1784+
t.Log("container status", status.Status)
1785+
} else {
1786+
t.Log("8. Skip exec2")
1787+
}
1788+
1789+
// 9. Validate
1790+
t.Log("9. Validate")
1791+
status, err = exec1.Status(ctx)
1792+
require.NoError(t, err, "exec1 status")
1793+
t.Logf("exec1 status: %s", status.Status)
1794+
assert.Equal(t, Created, status.Status)
1795+
status, err = exec2.Status(ctx)
1796+
require.NoError(t, err, "exec2 status")
1797+
t.Logf("exec2 status: %s", status.Status)
1798+
assert.Equal(t, Created, status.Status)
1799+
status, err = task.Status(ctx)
1800+
t.Logf("task status: %s", status.Status)
1801+
require.NoError(t, err, "container status")
1802+
assert.Equal(t, Stopped, status.Status)
1803+
}
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
//go:build linux
2+
3+
/*
4+
Copyright The containerd Authors.
5+
6+
Licensed under the Apache License, Version 2.0 (the "License");
7+
you may not use this file except in compliance with the License.
8+
You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing, software
13+
distributed under the License is distributed on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
See the License for the specific language governing permissions and
16+
limitations under the License.
17+
*/
18+
19+
package main
20+
21+
import (
22+
"context"
23+
"encoding/json"
24+
"errors"
25+
"fmt"
26+
"io"
27+
"os"
28+
"strings"
29+
30+
"github.com/opencontainers/runtime-spec/specs-go"
31+
"github.com/sirupsen/logrus"
32+
33+
"github.com/containerd/containerd/integration/failpoint"
34+
"github.com/containerd/containerd/pkg/fifosync"
35+
)
36+
37+
// delayExec delays an "exec" command until a trigger is received from the calling test program. This can be used to
38+
// test races around container lifecycle and exec processes.
39+
func delayExec(ctx context.Context, method invoker) error {
40+
isExec := strings.Contains(strings.Join(os.Args, ","), ",exec,")
41+
if !isExec {
42+
if err := method(ctx); err != nil {
43+
return err
44+
}
45+
return nil
46+
}
47+
logrus.Debug("EXEC!")
48+
49+
if err := delay(); err != nil {
50+
return err
51+
}
52+
if err := method(ctx); err != nil {
53+
return err
54+
}
55+
return nil
56+
}
57+
58+
func delay() error {
59+
ready, delay, err := fifoFromProcessEnv()
60+
if err != nil {
61+
return err
62+
}
63+
if err := ready.Trigger(); err != nil {
64+
return err
65+
}
66+
return delay.Wait()
67+
}
68+
69+
// fifoFromProcessEnv finds a fifo specified in the environment variables of an exec process
70+
func fifoFromProcessEnv() (fifosync.Trigger, fifosync.Waiter, error) {
71+
env, err := processEnvironment()
72+
if err != nil {
73+
return nil, nil, err
74+
}
75+
76+
readyName, ok := env[failpoint.DelayExecReadyEnv]
77+
if !ok {
78+
return nil, nil, fmt.Errorf("fifo: failed to find %q env var in %v", failpoint.DelayExecReadyEnv, env)
79+
}
80+
delayName, ok := env[failpoint.DelayExecDelayEnv]
81+
if !ok {
82+
return nil, nil, fmt.Errorf("fifo: failed to find %q env var in %v", failpoint.DelayExecDelayEnv, env)
83+
}
84+
logrus.WithField("ready", readyName).WithField("delay", delayName).Debug("Found FIFOs!")
85+
readyFIFO, err := fifosync.NewTrigger(readyName, 0600)
86+
if err != nil {
87+
return nil, nil, err
88+
}
89+
delayFIFO, err := fifosync.NewWaiter(delayName, 0600)
90+
if err != nil {
91+
return nil, nil, err
92+
}
93+
return readyFIFO, delayFIFO, nil
94+
}
95+
96+
func processEnvironment() (map[string]string, error) {
97+
idx := 2
98+
for ; idx < len(os.Args); idx++ {
99+
if os.Args[idx] == "--process" {
100+
break
101+
}
102+
}
103+
104+
if idx >= len(os.Args)-1 || os.Args[idx] != "--process" {
105+
return nil, errors.New("env: option --process required")
106+
}
107+
108+
specFile := os.Args[idx+1]
109+
f, err := os.OpenFile(specFile, os.O_RDONLY, 0o644)
110+
if err != nil {
111+
return nil, fmt.Errorf("env: failed to open %s: %w", specFile, err)
112+
}
113+
114+
b, err := io.ReadAll(f)
115+
if err != nil {
116+
return nil, fmt.Errorf("env: failed to read spec from %q", specFile)
117+
}
118+
var spec specs.Process
119+
if err := json.Unmarshal(b, &spec); err != nil {
120+
return nil, fmt.Errorf("env: failed to unmarshal spec from %q: %w", specFile, err)
121+
}
122+
123+
// XXX: env vars can be specified multiple times, but we only keep one
124+
env := make(map[string]string)
125+
for _, e := range spec.Env {
126+
k, v, _ := strings.Cut(e, "=")
127+
env[k] = v
128+
}
129+
130+
return env, nil
131+
}

integration/failpoint/cmd/runc-fp/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"syscall"
2727

2828
"github.com/containerd/containerd/oci"
29+
2930
"github.com/sirupsen/logrus"
3031
)
3132

@@ -40,6 +41,7 @@ type invokerInterceptor func(context.Context, invoker) error
4041
var (
4142
failpointProfiles = map[string]invokerInterceptor{
4243
"issue9103": issue9103KillInitAfterCreate,
44+
"delayExec": delayExec,
4345
}
4446
)
4547

integration/failpoint/const.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
/*
2+
Copyright The containerd Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package failpoint
18+
19+
const (
20+
DelayExecReadyEnv = "_RUNC_FP_DELAY_EXEC_READY"
21+
DelayExecDelayEnv = "_RUNC_FP_DELAY_EXEC_DELAY"
22+
)

0 commit comments

Comments
 (0)