@@ -34,14 +34,18 @@ import (
3434 "github.com/containerd/cgroups/v3/cgroup1"
3535 cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2"
3636 "github.com/containerd/containerd/api/types/runc/options"
37+ "github.com/containerd/errdefs"
38+ "github.com/stretchr/testify/assert"
39+
3740 . "github.com/containerd/containerd/v2/client"
3841 "github.com/containerd/containerd/v2/core/containers"
42+ "github.com/containerd/containerd/v2/integration/failpoint"
3943 "github.com/containerd/containerd/v2/pkg/cio"
44+ "github.com/containerd/containerd/v2/pkg/fifosync"
4045 "github.com/containerd/containerd/v2/pkg/oci"
4146 "github.com/containerd/containerd/v2/pkg/shim"
4247 "github.com/containerd/containerd/v2/pkg/sys"
4348 "github.com/containerd/containerd/v2/plugins"
44- "github.com/containerd/errdefs"
4549
4650 "github.com/opencontainers/runtime-spec/specs-go"
4751 "github.com/stretchr/testify/require"
@@ -1551,3 +1555,207 @@ func TestIssue9103(t *testing.T) {
15511555 })
15521556 }
15531557}
1558+
1559+ // TestIssue10589 is used as regression case for issue 10589.
1560+ //
1561+ // This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates
1562+ // by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This
1563+ // allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist.
1564+ //
1565+ // The workflow is as follows:
1566+ // 1. Create a container as normal
1567+ // 2. Make an exec1 using runc-fp with delayexec
1568+ // 3. Wait until the exec is waiting to start (triggered by delayexec)
1569+ // 4. Kill the container init process (signalling it is easiest)
1570+ // 5. Make an exec2 using runc-fp with delayexec
1571+ // 6. Wait until the exec is waiting to start
1572+ // 7. Allow exec1 to proceed
1573+ // 8. Allow exec2 to proceed
1574+ // 9. See that the container has exited and all execs have exited too
1575+ //
1576+ // https://github.com/containerd/containerd/issues/10589
1577+ func TestIssue10589 (t * testing.T ) {
1578+ if f := os .Getenv ("RUNC_FLAVOR" ); f != "" && f != "runc" {
1579+ t .Skip ("test requires runc" )
1580+ }
1581+
1582+ client , err := newClient (t , address )
1583+ require .NoError (t , err )
1584+ t .Cleanup (func () {
1585+ client .Close ()
1586+ })
1587+
1588+ var (
1589+ image Image
1590+ ctx , cancel = testContext (t )
1591+ id = t .Name ()
1592+ )
1593+ t .Cleanup (cancel )
1594+
1595+ image , err = client .GetImage (ctx , testImage )
1596+ require .NoError (t , err )
1597+
1598+ // 1. Create a sleeping container
1599+ t .Log ("1. Create a sleeping container" )
1600+ container , err := client .NewContainer (ctx , id ,
1601+ WithNewSnapshot (id , image ),
1602+ WithNewSpec (oci .WithImageConfig (image ),
1603+ withProcessArgs ("sleep" , "inf" ),
1604+ oci .WithAnnotations (map [string ]string {
1605+ "oci.runc.failpoint.profile" : "delayExec" ,
1606+ }),
1607+ ),
1608+ WithRuntime (client .Runtime (), & options.Options {
1609+ BinaryName : "runc-fp" ,
1610+ }),
1611+ )
1612+ require .NoError (t , err , "create container" )
1613+ t .Cleanup (func () {
1614+ ctx , cancel := context .WithTimeout (ctx , 10 * time .Second )
1615+ err := container .Delete (ctx , WithSnapshotCleanup )
1616+ if err != nil {
1617+ t .Log ("delete err" , err )
1618+ }
1619+ cancel ()
1620+ })
1621+
1622+ task , err := container .NewTask (ctx , empty ())
1623+ require .NoError (t , err , "create task" )
1624+ t .Cleanup (func () {
1625+ ctx , cancel := context .WithTimeout (ctx , 2 * time .Second )
1626+ st , err := task .Delete (ctx , WithProcessKill )
1627+ t .Log ("exit status" , st )
1628+ if err != nil {
1629+ t .Log ("kill err" , err )
1630+ }
1631+ cancel ()
1632+ })
1633+
1634+ err = task .Start (ctx )
1635+ require .NoError (t , err , "start container" )
1636+
1637+ status , err := task .Status (ctx )
1638+ require .NoError (t , err , "container status" )
1639+ require .Equal (t , Running , status .Status )
1640+
1641+ // 2. Create an exec
1642+ t .Log ("2. Create exec1" )
1643+ exec1ReadyFifo , err := fifosync .NewWaiter (filepath .Join (t .TempDir (), "exec1-ready.fifo" ), 0600 )
1644+ require .NoError (t , err , "create exec1 ready fifo" )
1645+ exec1DelayFifo , err := fifosync .NewTrigger (filepath .Join (t .TempDir (), "exec1-delay.fifo" ), 0600 )
1646+ require .NoError (t , err , "create exec1 delay fifo" )
1647+ exec1 , err := task .Exec (ctx , "exec1" , & specs.Process {
1648+ Args : []string {"/bin/sleep" , "301" },
1649+ Cwd : "/" ,
1650+ Env : []string {
1651+ failpoint .DelayExecReadyEnv + "=" + exec1ReadyFifo .Name (),
1652+ failpoint .DelayExecDelayEnv + "=" + exec1DelayFifo .Name (),
1653+ },
1654+ }, cio .NullIO )
1655+ require .NoError (t , err , "create exec1" )
1656+
1657+ exec1done := make (chan struct {})
1658+ go func () {
1659+ defer close (exec1done )
1660+ t .Log ("Starting exec1" )
1661+ err := exec1 .Start (ctx )
1662+ assert .Error (t , err , "start exec1" )
1663+ t .Logf ("error starting exec1: %s" , err )
1664+ }()
1665+
1666+ // 3. Wait until the exec is waiting to start
1667+ t .Log ("3. Wait until exec1 is waiting to start" )
1668+ err = exec1ReadyFifo .Wait ()
1669+ require .NoError (t , err , "open exec1 fifo" )
1670+
1671+ // 4. Kill the container init process
1672+ t .Log ("4. Kill the container init process" )
1673+ target := task .Pid ()
1674+ t .Logf ("Killing main pid (%v) of container %s" , target , container .ID ())
1675+ syscall .Kill (int (target ), syscall .SIGKILL )
1676+ status , err = task .Status (ctx )
1677+ require .NoError (t , err , "container status" )
1678+ t .Log ("container status" , status .Status )
1679+
1680+ // 5. Make an exec (2) using this failpoint
1681+ t .Log ("5. Create exec2" )
1682+ exec2ReadyFifo , err := fifosync .NewWaiter (filepath .Join (t .TempDir (), "exec2-ready.fifo" ), 0600 )
1683+ require .NoError (t , err , "create exec2 ready fifo: %q" , exec2ReadyFifo )
1684+ exec2DelayFifo , err := fifosync .NewTrigger (filepath .Join (t .TempDir (), "exec2-delay.fifo" ), 0600 )
1685+ require .NoError (t , err , "create exec2 delay fifo: %q" , exec2DelayFifo )
1686+ exec2 , err := task .Exec (ctx , "exec2" , & specs.Process {
1687+ Args : []string {"/bin/sleep" , "302" },
1688+ Cwd : "/" ,
1689+ Env : []string {
1690+ failpoint .DelayExecReadyEnv + "=" + exec2ReadyFifo .Name (),
1691+ failpoint .DelayExecDelayEnv + "=" + exec2DelayFifo .Name (),
1692+ },
1693+ }, cio .NullIO )
1694+ require .NoError (t , err , "create exec2" )
1695+
1696+ exec2done := make (chan struct {})
1697+ didExec2Run := true
1698+ go func () {
1699+ defer close (exec2done )
1700+ t .Log ("Starting exec2" )
1701+ err := exec2 .Start (ctx )
1702+ assert .Error (t , err , "start exec2" )
1703+ t .Logf ("error starting exec2: %s" , err )
1704+ }()
1705+
1706+ // 6. Wait until the exec is waiting to start
1707+ t .Log ("6. Wait until exec2 is waiting to start" )
1708+ exec2ready := make (chan struct {})
1709+ go func () {
1710+ exec2ReadyFifo .Wait ()
1711+ close (exec2ready )
1712+ }()
1713+ select {
1714+ case <- exec2ready :
1715+ case <- exec2done :
1716+ didExec2Run = false
1717+ }
1718+
1719+ // 7. Allow exec=1 to proceed
1720+ t .Log ("7. Allow exec=1 to proceed" )
1721+ err = exec1DelayFifo .Trigger ()
1722+ assert .NoError (t , err , "trigger exec1 fifo" )
1723+ status , err = task .Status (ctx )
1724+ require .NoError (t , err , "container status" )
1725+ t .Log ("container status" , status .Status )
1726+ <- exec1done
1727+ status , err = task .Status (ctx )
1728+ require .NoError (t , err , "container status" )
1729+ t .Log ("container status" , status .Status )
1730+
1731+ // 8. Allow exec=2 to proceed
1732+ if didExec2Run {
1733+ t .Log ("8. Allow exec2 to proceed" )
1734+ err = exec2DelayFifo .Trigger ()
1735+ assert .NoError (t , err , "trigger exec2 fifo" )
1736+ status , err = task .Status (ctx )
1737+ require .NoError (t , err , "container status" )
1738+ t .Log ("container status" , status .Status )
1739+ <- exec2done
1740+ status , err = task .Status (ctx )
1741+ require .NoError (t , err , "container status" )
1742+ t .Log ("container status" , status .Status )
1743+ } else {
1744+ t .Log ("8. Skip exec2" )
1745+ }
1746+
1747+ // 9. Validate
1748+ t .Log ("9. Validate" )
1749+ status , err = exec1 .Status (ctx )
1750+ require .NoError (t , err , "exec1 status" )
1751+ t .Logf ("exec1 status: %s" , status .Status )
1752+ assert .Equal (t , Created , status .Status )
1753+ status , err = exec2 .Status (ctx )
1754+ require .NoError (t , err , "exec2 status" )
1755+ t .Logf ("exec2 status: %s" , status .Status )
1756+ assert .Equal (t , Created , status .Status )
1757+ status , err = task .Status (ctx )
1758+ t .Logf ("task status: %s" , status .Status )
1759+ require .NoError (t , err , "container status" )
1760+ assert .Equal (t , Stopped , status .Status )
1761+ }
0 commit comments