@@ -37,14 +37,17 @@ import (
3737 . "github.com/containerd/containerd"
3838 "github.com/containerd/containerd/cio"
3939 "github.com/containerd/containerd/containers"
40+ "github.com/containerd/containerd/integration/failpoint"
4041 "github.com/containerd/containerd/oci"
42+ "github.com/containerd/containerd/pkg/fifosync"
4143 "github.com/containerd/containerd/plugin"
4244 "github.com/containerd/containerd/runtime/linux/runctypes"
4345 "github.com/containerd/containerd/runtime/v2/runc/options"
4446 "github.com/containerd/containerd/sys"
4547 "github.com/containerd/errdefs"
4648
4749 "github.com/opencontainers/runtime-spec/specs-go"
50+ "github.com/stretchr/testify/assert"
4851 "github.com/stretchr/testify/require"
4952 "golang.org/x/sys/unix"
5053)
@@ -1595,3 +1598,210 @@ func TestIssue9103(t *testing.T) {
15951598 })
15961599 }
15971600}
1601+
1602+ // TestIssue10589 is used as regression case for issue 10589.
1603+ //
1604+ // This issue was caused by a race between init exits and new exec process tracking inside the shim. The test operates
1605+ // by controlling the time between when the shim invokes "runc exec" and when the actual "runc exec" is triggered. This
1606+ // allows validating that races for shim state tracking between pre- and post-start of the exec process do not exist.
1607+ //
1608+ // The workflow is as follows:
1609+ // 1. Create a container as normal
1610+ // 2. Make an exec1 using runc-fp with delayexec
1611+ // 3. Wait until the exec is waiting to start (triggered by delayexec)
1612+ // 4. Kill the container init process (signalling it is easiest)
1613+ // 5. Make an exec2 using runc-fp with delayexec
1614+ // 6. Wait until the exec is waiting to start
1615+ // 7. Allow exec1 to proceed
1616+ // 8. Allow exec2 to proceed
1617+ // 9. See that the container has exited and all execs have exited too
1618+ //
1619+ // https://github.com/containerd/containerd/issues/10589
1620+ func TestIssue10589 (t * testing.T ) {
1621+ if f := os .Getenv ("RUNC_FLAVOR" ); f != "" && f != "runc" {
1622+ t .Skip ("test requires runc" )
1623+ }
1624+ if rt := os .Getenv ("TEST_RUNTIME" ); rt != "" && rt != plugin .RuntimeRuncV2 {
1625+ t .Skip ("test requires io.containerd.runc.v2" )
1626+ }
1627+
1628+ client , err := newClient (t , address )
1629+ require .NoError (t , err )
1630+ t .Cleanup (func () {
1631+ client .Close ()
1632+ })
1633+
1634+ var (
1635+ image Image
1636+ ctx , cancel = testContext (t )
1637+ id = t .Name ()
1638+ )
1639+ t .Cleanup (cancel )
1640+
1641+ image , err = client .GetImage (ctx , testImage )
1642+ require .NoError (t , err )
1643+
1644+ // 1. Create a sleeping container
1645+ t .Log ("1. Create a sleeping container" )
1646+ container , err := client .NewContainer (ctx , id ,
1647+ WithNewSnapshot (id , image ),
1648+ WithNewSpec (oci .WithImageConfig (image ),
1649+ withProcessArgs ("sleep" , "inf" ),
1650+ oci .WithAnnotations (map [string ]string {
1651+ "oci.runc.failpoint.profile" : "delayExec" ,
1652+ }),
1653+ ),
1654+ WithRuntime (client .Runtime (), & options.Options {
1655+ BinaryName : "runc-fp" ,
1656+ }),
1657+ )
1658+ require .NoError (t , err , "create container" )
1659+ t .Cleanup (func () {
1660+ ctx , cancel := context .WithTimeout (ctx , 10 * time .Second )
1661+ err := container .Delete (ctx , WithSnapshotCleanup )
1662+ if err != nil {
1663+ t .Log ("delete err" , err )
1664+ }
1665+ cancel ()
1666+ })
1667+
1668+ task , err := container .NewTask (ctx , empty ())
1669+ require .NoError (t , err , "create task" )
1670+ t .Cleanup (func () {
1671+ ctx , cancel := context .WithTimeout (ctx , 2 * time .Second )
1672+ st , err := task .Delete (ctx , WithProcessKill )
1673+ t .Log ("exit status" , st )
1674+ if err != nil {
1675+ t .Log ("kill err" , err )
1676+ }
1677+ cancel ()
1678+ })
1679+
1680+ err = task .Start (ctx )
1681+ require .NoError (t , err , "start container" )
1682+
1683+ status , err := task .Status (ctx )
1684+ require .NoError (t , err , "container status" )
1685+ require .Equal (t , Running , status .Status )
1686+
1687+ // 2. Create an exec
1688+ t .Log ("2. Create exec1" )
1689+ exec1ReadyFifo , err := fifosync .NewWaiter (filepath .Join (t .TempDir (), "exec1-ready.fifo" ), 0600 )
1690+ require .NoError (t , err , "create exec1 ready fifo" )
1691+ exec1DelayFifo , err := fifosync .NewTrigger (filepath .Join (t .TempDir (), "exec1-delay.fifo" ), 0600 )
1692+ require .NoError (t , err , "create exec1 delay fifo" )
1693+ exec1 , err := task .Exec (ctx , "exec1" , & specs.Process {
1694+ Args : []string {"/bin/sleep" , "301" },
1695+ Cwd : "/" ,
1696+ Env : []string {
1697+ failpoint .DelayExecReadyEnv + "=" + exec1ReadyFifo .Name (),
1698+ failpoint .DelayExecDelayEnv + "=" + exec1DelayFifo .Name (),
1699+ },
1700+ }, cio .NullIO )
1701+ require .NoError (t , err , "create exec1" )
1702+
1703+ exec1done := make (chan struct {})
1704+ go func () {
1705+ defer close (exec1done )
1706+ t .Log ("Starting exec1" )
1707+ err := exec1 .Start (ctx )
1708+ assert .Error (t , err , "start exec1" )
1709+ t .Logf ("error starting exec1: %s" , err )
1710+ }()
1711+
1712+ // 3. Wait until the exec is waiting to start
1713+ t .Log ("3. Wait until exec1 is waiting to start" )
1714+ err = exec1ReadyFifo .Wait ()
1715+ require .NoError (t , err , "open exec1 fifo" )
1716+
1717+ // 4. Kill the container init process
1718+ t .Log ("4. Kill the container init process" )
1719+ target := task .Pid ()
1720+ t .Logf ("Killing main pid (%v) of container %s" , target , container .ID ())
1721+ syscall .Kill (int (target ), syscall .SIGKILL )
1722+ status , err = task .Status (ctx )
1723+ require .NoError (t , err , "container status" )
1724+ t .Log ("container status" , status .Status )
1725+
1726+ // 5. Make an exec (2) using this failpoint
1727+ t .Log ("5. Create exec2" )
1728+ exec2ReadyFifo , err := fifosync .NewWaiter (filepath .Join (t .TempDir (), "exec2-ready.fifo" ), 0600 )
1729+ require .NoError (t , err , "create exec2 ready fifo: %q" , exec2ReadyFifo )
1730+ exec2DelayFifo , err := fifosync .NewTrigger (filepath .Join (t .TempDir (), "exec2-delay.fifo" ), 0600 )
1731+ require .NoError (t , err , "create exec2 delay fifo: %q" , exec2DelayFifo )
1732+ exec2 , err := task .Exec (ctx , "exec2" , & specs.Process {
1733+ Args : []string {"/bin/sleep" , "302" },
1734+ Cwd : "/" ,
1735+ Env : []string {
1736+ failpoint .DelayExecReadyEnv + "=" + exec2ReadyFifo .Name (),
1737+ failpoint .DelayExecDelayEnv + "=" + exec2DelayFifo .Name (),
1738+ },
1739+ }, cio .NullIO )
1740+ require .NoError (t , err , "create exec2" )
1741+
1742+ exec2done := make (chan struct {})
1743+ didExec2Run := true
1744+ go func () {
1745+ defer close (exec2done )
1746+ t .Log ("Starting exec2" )
1747+ err := exec2 .Start (ctx )
1748+ assert .Error (t , err , "start exec2" )
1749+ t .Logf ("error starting exec2: %s" , err )
1750+ }()
1751+
1752+ // 6. Wait until the exec is waiting to start
1753+ t .Log ("6. Wait until exec2 is waiting to start" )
1754+ exec2ready := make (chan struct {})
1755+ go func () {
1756+ exec2ReadyFifo .Wait ()
1757+ close (exec2ready )
1758+ }()
1759+ select {
1760+ case <- exec2ready :
1761+ case <- exec2done :
1762+ didExec2Run = false
1763+ }
1764+
1765+ // 7. Allow exec=1 to proceed
1766+ t .Log ("7. Allow exec=1 to proceed" )
1767+ err = exec1DelayFifo .Trigger ()
1768+ assert .NoError (t , err , "trigger exec1 fifo" )
1769+ status , err = task .Status (ctx )
1770+ require .NoError (t , err , "container status" )
1771+ t .Log ("container status" , status .Status )
1772+ <- exec1done
1773+ status , err = task .Status (ctx )
1774+ require .NoError (t , err , "container status" )
1775+ t .Log ("container status" , status .Status )
1776+
1777+ // 8. Allow exec=2 to proceed
1778+ if didExec2Run {
1779+ t .Log ("8. Allow exec2 to proceed" )
1780+ err = exec2DelayFifo .Trigger ()
1781+ assert .NoError (t , err , "trigger exec2 fifo" )
1782+ status , err = task .Status (ctx )
1783+ require .NoError (t , err , "container status" )
1784+ t .Log ("container status" , status .Status )
1785+ <- exec2done
1786+ status , err = task .Status (ctx )
1787+ require .NoError (t , err , "container status" )
1788+ t .Log ("container status" , status .Status )
1789+ } else {
1790+ t .Log ("8. Skip exec2" )
1791+ }
1792+
1793+ // 9. Validate
1794+ t .Log ("9. Validate" )
1795+ status , err = exec1 .Status (ctx )
1796+ require .NoError (t , err , "exec1 status" )
1797+ t .Logf ("exec1 status: %s" , status .Status )
1798+ assert .Equal (t , Created , status .Status )
1799+ status , err = exec2 .Status (ctx )
1800+ require .NoError (t , err , "exec2 status" )
1801+ t .Logf ("exec2 status: %s" , status .Status )
1802+ assert .Equal (t , Created , status .Status )
1803+ status , err = task .Status (ctx )
1804+ t .Logf ("task status: %s" , status .Status )
1805+ require .NoError (t , err , "container status" )
1806+ assert .Equal (t , Stopped , status .Status )
1807+ }
0 commit comments