Skip to content

Commit 01b9911

Browse files
committed
Add retry around wclayer operations for process isolated containers
This change adds a simple retry loop to handle some behavior on RS5. Loopback VHDs used to be mounted in a different manor on RS5 (ws2019) which led to some very odd cases where things would succeed when they shouldn't have, or we'd simply timeout if an operation took too long. Many parallel invocations of this code path and stressing the machine seem to bring out the issues, but all of the possible failure paths that bring about the errors we have observed aren't known. On 19h1+ this retry loop shouldn't be needed, but the logic is to leave the loop if everything succeeded so this is harmless and shouldn't need a version check. Signed-off-by: Daniel Canter <[email protected]>
1 parent 4775815 commit 01b9911

1 file changed

Lines changed: 45 additions & 9 deletions

File tree

internal/layers/layers.go

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"path/filepath"
1111

1212
hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
13+
"github.com/Microsoft/hcsshim/internal/hcserror"
1314
"github.com/Microsoft/hcsshim/internal/log"
1415
"github.com/Microsoft/hcsshim/internal/ospath"
1516
"github.com/Microsoft/hcsshim/internal/uvm"
@@ -84,21 +85,56 @@ func MountContainerLayers(ctx context.Context, layerFolders []string, guestRoot,
8485
}
8586
path := layerFolders[len(layerFolders)-1]
8687
rest := layerFolders[:len(layerFolders)-1]
87-
if err := wclayer.ActivateLayer(ctx, path); err != nil {
88-
return "", err
89-
}
90-
defer func() {
91-
if err != nil {
92-
_ = wclayer.DeactivateLayer(ctx, path)
88+
// Simple retry loop to handle some behavior on RS5. Loopback VHDs used to be mounted in a different manor on RS5 (ws2019) which led to some
89+
// very odd cases where things would succeed when they shouldn't have, or we'd simply timeout if an operation took too long. Many
90+
// parallel invocations of this code path and stressing the machine seem to bring out the issues, but all of the possible failure paths
91+
// that bring about the errors we have observed aren't known.
92+
//
93+
// On 19h1+ this *shouldn't* be needed, but the logic is to break if everything succeeded so this is harmless and shouldn't need a version check.
94+
var lErr error
95+
for i := 0; i < 5; i++ {
96+
lErr = func() (err error) {
97+
if err := wclayer.ActivateLayer(ctx, path); err != nil {
98+
return err
99+
}
100+
101+
defer func() {
102+
if err != nil {
103+
_ = wclayer.DeactivateLayer(ctx, path)
104+
}
105+
}()
106+
107+
return wclayer.PrepareLayer(ctx, path, rest)
108+
}()
109+
110+
if lErr != nil {
111+
// Common errors seen from the RS5 behavior mentioned above is ERROR_NOT_READY and ERROR_DEVICE_NOT_CONNECTED. The former occurs when HCS
112+
// tries to grab the volume path of the disk but it doesn't succeed, usually because the disk isn't actually mounted. DEVICE_NOT_CONNECTED
113+
// has been observed after launching multiple containers in parallel on a machine under high load. This has also been observed to be a trigger
114+
// for ERROR_NOT_READY as well.
115+
if hcserr, ok := lErr.(*hcserror.HcsError); ok {
116+
if hcserr.Err == windows.ERROR_NOT_READY || hcserr.Err == windows.ERROR_DEVICE_NOT_CONNECTED {
117+
continue
118+
}
119+
}
120+
// This was a failure case outside of the commonly known error conditions, don't retry here.
121+
return "", lErr
93122
}
94-
}()
95123

96-
if err := wclayer.PrepareLayer(ctx, path, rest); err != nil {
97-
return "", err
124+
// No errors in layer setup, we can leave the loop
125+
break
98126
}
127+
// If we got unlucky and ran into one of the two errors mentioned five times in a row and left the loop, we need to check
128+
// the loop error here and fail also.
129+
if lErr != nil {
130+
return "", errors.Wrap(lErr, "layer retry loop failed")
131+
}
132+
133+
// If any of the below fails, we want to detach the filter and unmount the disk.
99134
defer func() {
100135
if err != nil {
101136
_ = wclayer.UnprepareLayer(ctx, path)
137+
_ = wclayer.DeactivateLayer(ctx, path)
102138
}
103139
}()
104140

0 commit comments

Comments
 (0)