@@ -35,14 +35,17 @@ import (
3535 . "github.com/containerd/containerd"
3636 "github.com/containerd/containerd/cio"
3737 "github.com/containerd/containerd/containers"
38+ "github.com/containerd/containerd/integration/failpoint"
3839 "github.com/containerd/containerd/oci"
40+ "github.com/containerd/containerd/pkg/fifosync"
3941 "github.com/containerd/containerd/plugin"
4042 "github.com/containerd/containerd/runtime/linux/runctypes"
4143 "github.com/containerd/containerd/runtime/v2/runc/options"
4244 "github.com/containerd/containerd/sys"
4345 "github.com/containerd/errdefs"
4446
4547 "github.com/opencontainers/runtime-spec/specs-go"
48+ "github.com/stretchr/testify/assert"
4649 "github.com/stretchr/testify/require"
4750 exec "golang.org/x/sys/execabs"
4851 "golang.org/x/sys/unix"
@@ -1591,3 +1594,210 @@ func TestIssue9103(t *testing.T) {
15911594 })
15921595 }
15931596}
1597+
1598+ // TestIssue10589 is used as regression case for issue 10589.
1599+ //
1600+ // This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates
1601+ // by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This
1602+ // allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist.
1603+ //
1604+ // The workflow is as follows:
1605+ // 1. Create a container as normal
1606+ // 2. Make an exec1 using runc-fp with delayexec
1607+ // 3. Wait until the exec is waiting to start (triggered by delayexec)
1608+ // 4. Kill the container init process (signalling it is easiest)
1609+ // 5. Make an exec2 using runc-fp with delayexec
1610+ // 6. Wait until the exec is waiting to start
1611+ // 7. Allow exec1 to proceed
1612+ // 8. Allow exec2 to proceed
1613+ // 9. See that the container has exited and all execs have exited too
1614+ //
1615+ // https://github.com/containerd/containerd/issues/10589
1616+ func TestIssue10589 (t * testing.T ) {
1617+ if f := os .Getenv ("RUNC_FLAVOR" ); f != "" && f != "runc" {
1618+ t .Skip ("test requires runc" )
1619+ }
1620+ if rt := os .Getenv ("TEST_RUNTIME" ); rt != "" && rt != plugin .RuntimeRuncV2 {
1621+ t .Skip ("test requires io.containerd.runc.v2" )
1622+ }
1623+
1624+ client , err := newClient (t , address )
1625+ require .NoError (t , err )
1626+ t .Cleanup (func () {
1627+ client .Close ()
1628+ })
1629+
1630+ var (
1631+ image Image
1632+ ctx , cancel = testContext (t )
1633+ id = t .Name ()
1634+ )
1635+ t .Cleanup (cancel )
1636+
1637+ image , err = client .GetImage (ctx , testImage )
1638+ require .NoError (t , err )
1639+
1640+ // 1. Create a sleeping container
1641+ t .Log ("1. Create a sleeping container" )
1642+ container , err := client .NewContainer (ctx , id ,
1643+ WithNewSnapshot (id , image ),
1644+ WithNewSpec (oci .WithImageConfig (image ),
1645+ withProcessArgs ("sleep" , "inf" ),
1646+ oci .WithAnnotations (map [string ]string {
1647+ "oci.runc.failpoint.profile" : "delayExec" ,
1648+ }),
1649+ ),
1650+ WithRuntime (client .Runtime (), & options.Options {
1651+ BinaryName : "runc-fp" ,
1652+ }),
1653+ )
1654+ require .NoError (t , err , "create container" )
1655+ t .Cleanup (func () {
1656+ ctx , cancel := context .WithTimeout (ctx , 10 * time .Second )
1657+ err := container .Delete (ctx , WithSnapshotCleanup )
1658+ if err != nil {
1659+ t .Log ("delete err" , err )
1660+ }
1661+ cancel ()
1662+ })
1663+
1664+ task , err := container .NewTask (ctx , empty ())
1665+ require .NoError (t , err , "create task" )
1666+ t .Cleanup (func () {
1667+ ctx , cancel := context .WithTimeout (ctx , 2 * time .Second )
1668+ st , err := task .Delete (ctx , WithProcessKill )
1669+ t .Log ("exit status" , st )
1670+ if err != nil {
1671+ t .Log ("kill err" , err )
1672+ }
1673+ cancel ()
1674+ })
1675+
1676+ err = task .Start (ctx )
1677+ require .NoError (t , err , "start container" )
1678+
1679+ status , err := task .Status (ctx )
1680+ require .NoError (t , err , "container status" )
1681+ require .Equal (t , Running , status .Status )
1682+
1683+ // 2. Create an exec
1684+ t .Log ("2. Create exec1" )
1685+ exec1ReadyFifo , err := fifosync .NewWaiter (filepath .Join (t .TempDir (), "exec1-ready.fifo" ), 0600 )
1686+ require .NoError (t , err , "create exec1 ready fifo" )
1687+ exec1DelayFifo , err := fifosync .NewTrigger (filepath .Join (t .TempDir (), "exec1-delay.fifo" ), 0600 )
1688+ require .NoError (t , err , "create exec1 delay fifo" )
1689+ exec1 , err := task .Exec (ctx , "exec1" , & specs.Process {
1690+ Args : []string {"/bin/sleep" , "301" },
1691+ Cwd : "/" ,
1692+ Env : []string {
1693+ failpoint .DelayExecReadyEnv + "=" + exec1ReadyFifo .Name (),
1694+ failpoint .DelayExecDelayEnv + "=" + exec1DelayFifo .Name (),
1695+ },
1696+ }, cio .NullIO )
1697+ require .NoError (t , err , "create exec1" )
1698+
1699+ exec1done := make (chan struct {})
1700+ go func () {
1701+ defer close (exec1done )
1702+ t .Log ("Starting exec1" )
1703+ err := exec1 .Start (ctx )
1704+ assert .Error (t , err , "start exec1" )
1705+ t .Logf ("error starting exec1: %s" , err )
1706+ }()
1707+
1708+ // 3. Wait until the exec is waiting to start
1709+ t .Log ("3. Wait until exec1 is waiting to start" )
1710+ err = exec1ReadyFifo .Wait ()
1711+ require .NoError (t , err , "open exec1 fifo" )
1712+
1713+ // 4. Kill the container init process
1714+ t .Log ("4. Kill the container init process" )
1715+ target := task .Pid ()
1716+ t .Logf ("Killing main pid (%v) of container %s" , target , container .ID ())
1717+ syscall .Kill (int (target ), syscall .SIGKILL )
1718+ status , err = task .Status (ctx )
1719+ require .NoError (t , err , "container status" )
1720+ t .Log ("container status" , status .Status )
1721+
1722+ // 5. Make an exec (2) using this failpoint
1723+ t .Log ("5. Create exec2" )
1724+ exec2ReadyFifo , err := fifosync .NewWaiter (filepath .Join (t .TempDir (), "exec2-ready.fifo" ), 0600 )
1725+ require .NoError (t , err , "create exec2 ready fifo: %q" , exec2ReadyFifo )
1726+ exec2DelayFifo , err := fifosync .NewTrigger (filepath .Join (t .TempDir (), "exec2-delay.fifo" ), 0600 )
1727+ require .NoError (t , err , "create exec2 delay fifo: %q" , exec2DelayFifo )
1728+ exec2 , err := task .Exec (ctx , "exec2" , & specs.Process {
1729+ Args : []string {"/bin/sleep" , "302" },
1730+ Cwd : "/" ,
1731+ Env : []string {
1732+ failpoint .DelayExecReadyEnv + "=" + exec2ReadyFifo .Name (),
1733+ failpoint .DelayExecDelayEnv + "=" + exec2DelayFifo .Name (),
1734+ },
1735+ }, cio .NullIO )
1736+ require .NoError (t , err , "create exec2" )
1737+
1738+ exec2done := make (chan struct {})
1739+ didExec2Run := true
1740+ go func () {
1741+ defer close (exec2done )
1742+ t .Log ("Starting exec2" )
1743+ err := exec2 .Start (ctx )
1744+ assert .Error (t , err , "start exec2" )
1745+ t .Logf ("error starting exec2: %s" , err )
1746+ }()
1747+
1748+ // 6. Wait until the exec is waiting to start
1749+ t .Log ("6. Wait until exec2 is waiting to start" )
1750+ exec2ready := make (chan struct {})
1751+ go func () {
1752+ exec2ReadyFifo .Wait ()
1753+ close (exec2ready )
1754+ }()
1755+ select {
1756+ case <- exec2ready :
1757+ case <- exec2done :
1758+ didExec2Run = false
1759+ }
1760+
1761+ // 7. Allow exec=1 to proceed
1762+ t .Log ("7. Allow exec=1 to proceed" )
1763+ err = exec1DelayFifo .Trigger ()
1764+ assert .NoError (t , err , "trigger exec1 fifo" )
1765+ status , err = task .Status (ctx )
1766+ require .NoError (t , err , "container status" )
1767+ t .Log ("container status" , status .Status )
1768+ <- exec1done
1769+ status , err = task .Status (ctx )
1770+ require .NoError (t , err , "container status" )
1771+ t .Log ("container status" , status .Status )
1772+
1773+ // 8. Allow exec=2 to proceed
1774+ if didExec2Run {
1775+ t .Log ("8. Allow exec2 to proceed" )
1776+ err = exec2DelayFifo .Trigger ()
1777+ assert .NoError (t , err , "trigger exec2 fifo" )
1778+ status , err = task .Status (ctx )
1779+ require .NoError (t , err , "container status" )
1780+ t .Log ("container status" , status .Status )
1781+ <- exec2done
1782+ status , err = task .Status (ctx )
1783+ require .NoError (t , err , "container status" )
1784+ t .Log ("container status" , status .Status )
1785+ } else {
1786+ t .Log ("8. Skip exec2" )
1787+ }
1788+
1789+ // 9. Validate
1790+ t .Log ("9. Validate" )
1791+ status , err = exec1 .Status (ctx )
1792+ require .NoError (t , err , "exec1 status" )
1793+ t .Logf ("exec1 status: %s" , status .Status )
1794+ assert .Equal (t , Created , status .Status )
1795+ status , err = exec2 .Status (ctx )
1796+ require .NoError (t , err , "exec2 status" )
1797+ t .Logf ("exec2 status: %s" , status .Status )
1798+ assert .Equal (t , Created , status .Status )
1799+ status , err = task .Status (ctx )
1800+ t .Logf ("task status: %s" , status .Status )
1801+ require .NoError (t , err , "container status" )
1802+ assert .Equal (t , Stopped , status .Status )
1803+ }
0 commit comments