Skip to content

Commit 579d8b4

Browse files
jsturtevantdims
authored andcommitted
[cri] Handle Windows pod transitions gracefully
When the pods are transitioning there are several cases where containers might not be in valid state. There were several cases where the stats where failing hard but we should just continue on as they are transient and will be picked up again when kubelet queries for the stats again. Signed-off-by: James Sturtevant <[email protected]> Signed-off-by: Mark Rossetti <[email protected]>
1 parent 8d849a2 commit 579d8b4

8 files changed

Lines changed: 150 additions & 43 deletions

pkg/cri/sbserver/sandbox_stats_linux.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/containerd/cgroups/v3"
2525
"github.com/containerd/cgroups/v3/cgroup1"
2626
cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2"
27+
"github.com/containerd/containerd/errdefs"
2728
"github.com/containerd/containerd/log"
2829
sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox"
2930
"github.com/containernetworking/plugins/pkg/ns"
@@ -37,7 +38,7 @@ func (c *criService) podSandboxStats(
3738
meta := sandbox.Metadata
3839

3940
if sandbox.Status.Get().State != sandboxstore.StateReady {
40-
return nil, fmt.Errorf("failed to get pod sandbox stats since sandbox container %q is not in ready state", meta.ID)
41+
return nil, fmt.Errorf("failed to get pod sandbox stats since sandbox container %q is not in ready state: %w", meta.ID, errdefs.ErrUnavailable)
4142
}
4243

4344
stats, err := metricsForSandbox(sandbox)

pkg/cri/sbserver/sandbox_stats_list.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,10 @@ import (
2020
"context"
2121
"fmt"
2222

23+
"github.com/containerd/containerd/errdefs"
24+
"github.com/containerd/containerd/log"
2325
sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox"
26+
"github.com/hashicorp/go-multierror"
2427
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
2528
)
2629

@@ -31,16 +34,21 @@ func (c *criService) ListPodSandboxStats(
3134
) (*runtime.ListPodSandboxStatsResponse, error) {
3235
sandboxes := c.sandboxesForListPodSandboxStatsRequest(r)
3336

37+
var errs *multierror.Error
3438
podSandboxStats := new(runtime.ListPodSandboxStatsResponse)
3539
for _, sandbox := range sandboxes {
3640
sandboxStats, err := c.podSandboxStats(ctx, sandbox)
37-
if err != nil {
38-
return nil, fmt.Errorf("failed to decode sandbox container metrics for sandbox %q: %w", sandbox.ID, err)
41+
switch {
42+
case errdefs.IsUnavailable(err):
43+
log.G(ctx).WithField("podsandboxid", sandbox.ID).Debugf("failed to get pod sandbox stats, this is likely a transient error: %v", err)
44+
case err != nil:
45+
errs = multierror.Append(errs, fmt.Errorf("failed to decode sandbox container metrics for sandbox %q: %w", sandbox.ID, err))
46+
default:
47+
podSandboxStats.Stats = append(podSandboxStats.Stats, sandboxStats)
3948
}
40-
podSandboxStats.Stats = append(podSandboxStats.Stats, sandboxStats)
4149
}
4250

43-
return podSandboxStats, nil
51+
return podSandboxStats, errs.ErrorOrNil()
4452
}
4553

4654
func (c *criService) sandboxesForListPodSandboxStatsRequest(r *runtime.ListPodSandboxStatsRequest) []sandboxstore.Sandbox {

pkg/cri/sbserver/sandbox_stats_windows.go

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/Microsoft/hcsshim/hcn"
2727
"github.com/containerd/containerd/api/services/tasks/v1"
2828
"github.com/containerd/containerd/api/types"
29+
"github.com/containerd/containerd/errdefs"
2930
"github.com/containerd/containerd/log"
3031
containerstore "github.com/containerd/containerd/pkg/cri/store/container"
3132
sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox"
@@ -36,12 +37,11 @@ import (
3637

3738
func (c *criService) podSandboxStats(
3839
ctx context.Context,
39-
sandbox sandboxstore.Sandbox,
40-
) (*runtime.PodSandboxStats, error) {
40+
sandbox sandboxstore.Sandbox) (*runtime.PodSandboxStats, error) {
4141
meta := sandbox.Metadata
4242

4343
if sandbox.Status.Get().State != sandboxstore.StateReady {
44-
return nil, fmt.Errorf("failed to get pod sandbox stats since sandbox container %q is not in ready state", meta.ID)
44+
return nil, fmt.Errorf("failed to get pod sandbox stats since sandbox container %q is not in ready state: %w", meta.ID, errdefs.ErrUnavailable)
4545
}
4646

4747
timestamp := time.Now()
@@ -62,7 +62,7 @@ func (c *criService) podSandboxStats(
6262

6363
statsMap, err := convertMetricsToWindowsStats(metrics, sandbox)
6464
if err != nil {
65-
return nil, err
65+
return nil, fmt.Errorf("failed to convert stats: %w", err)
6666
}
6767

6868
podCPU, containerStats, err := c.toPodSandboxStats(sandbox, statsMap, containers, timestamp)
@@ -72,12 +72,11 @@ func (c *criService) podSandboxStats(
7272
podSandboxStats.Windows.Cpu = podCPU.Cpu
7373
podSandboxStats.Windows.Memory = podCPU.Memory
7474
podSandboxStats.Windows.Containers = containerStats
75-
7675
podSandboxStats.Windows.Network = windowsNetworkUsage(ctx, sandbox, timestamp)
7776

7877
pidCount, err := c.getSandboxPidCount(ctx, sandbox)
7978
if err != nil {
80-
return nil, err
79+
return nil, fmt.Errorf("failed to get pid count: %w", err)
8180
}
8281

8382
podSandboxStats.Windows.Process = &runtime.WindowsProcessUsage{
@@ -131,11 +130,13 @@ func (c *criService) toPodSandboxStats(sandbox sandboxstore.Sandbox, statsMap ma
131130

132131
if cntr.Status.Get().State() != runtime.ContainerState_CONTAINER_RUNNING {
133132
// containers that are just created, in a failed state or exited (init containers) will not have stats
133+
log.L.Warnf("failed to get container stats since container %q is not in running state", cntr.ID)
134134
continue
135135
}
136136

137137
if containerMetric == nil {
138-
return nil, nil, fmt.Errorf("failed to find metrics for container with id %s: %w", cntr.ID, err)
138+
log.L.Warnf("no metrics found for container %q", cntr.ID)
139+
continue
139140
}
140141

141142
containerStats, err := c.convertToCRIStats(containerMetric)
@@ -144,7 +145,7 @@ func (c *criService) toPodSandboxStats(sandbox sandboxstore.Sandbox, statsMap ma
144145
}
145146

146147
// Calculate NanoCores for container
147-
if containerStats.Cpu.UsageCoreNanoSeconds != nil {
148+
if containerStats.Cpu != nil && containerStats.Cpu.UsageCoreNanoSeconds != nil {
148149
nanoCoreUsage := getUsageNanoCores(containerStats.Cpu.UsageCoreNanoSeconds.Value, cntr.Stats, containerStats.Cpu.Timestamp)
149150
containerStats.Cpu.UsageNanoCores = &runtime.UInt64Value{Value: nanoCoreUsage}
150151
}
@@ -180,7 +181,7 @@ func (c *criService) toPodSandboxStats(sandbox sandboxstore.Sandbox, statsMap ma
180181
}
181182

182183
// Calculate NanoCores for pod after adding containers cpu including the pods cpu
183-
if podRuntimeStats.Cpu.UsageCoreNanoSeconds != nil {
184+
if podRuntimeStats.Cpu != nil && podRuntimeStats.Cpu.UsageCoreNanoSeconds != nil {
184185
nanoCoreUsage := getUsageNanoCores(podRuntimeStats.Cpu.UsageCoreNanoSeconds.Value, sandbox.Stats, podRuntimeStats.Cpu.Timestamp)
185186
podRuntimeStats.Cpu.UsageNanoCores = &runtime.UInt64Value{Value: nanoCoreUsage}
186187
}
@@ -391,10 +392,16 @@ func (c *criService) getSandboxPidCount(ctx context.Context, sandbox sandboxstor
391392
// get process count inside PodSandbox for Windows
392393
task, err := sandbox.Container.Task(ctx, nil)
393394
if err != nil {
395+
if errdefs.IsNotFound(err) {
396+
return 0, nil
397+
}
394398
return 0, err
395399
}
396400
processes, err := task.Pids(ctx)
397401
if err != nil {
402+
if errdefs.IsNotFound(err) {
403+
return 0, nil
404+
}
398405
return 0, err
399406
}
400407
pidCount += uint64(len(processes))
@@ -416,6 +423,9 @@ func (c *criService) getSandboxPidCount(ctx context.Context, sandbox sandboxstor
416423

417424
processes, err := task.Pids(ctx)
418425
if err != nil {
426+
if errdefs.IsNotFound(err) {
427+
continue
428+
}
419429
return 0, err
420430
}
421431
pidCount += uint64(len(processes))

pkg/cri/sbserver/sandbox_stats_windows_test.go

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -89,15 +89,15 @@ func Test_criService_podSandboxStats(t *testing.T) {
8989
metrics map[string]*wstats.Statistics
9090
sandbox sandboxstore.Sandbox
9191
containers []containerstore.Container
92-
expectedPodStats expectedStats
92+
expectedPodStats *expectedStats
9393
expectedContainerStats []expectedStats
9494
expectError bool
9595
}{
9696
"no metrics found should return error": {
9797
metrics: map[string]*wstats.Statistics{},
9898
sandbox: sandboxstore.Sandbox{},
9999
containers: []containerstore.Container{},
100-
expectedPodStats: expectedStats{},
100+
expectedPodStats: &expectedStats{},
101101
expectedContainerStats: []expectedStats{},
102102
expectError: true,
103103
},
@@ -114,7 +114,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
114114
containers: []containerstore.Container{
115115
newContainer("c1", running, nil),
116116
},
117-
expectedPodStats: expectedStats{
117+
expectedPodStats: &expectedStats{
118118
UsageCoreNanoSeconds: 400,
119119
UsageNanoCores: 0,
120120
WorkingSetBytes: 40,
@@ -145,7 +145,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
145145
newContainer("c1", running, nil),
146146
newContainer("i1", running, nil),
147147
},
148-
expectedPodStats: expectedStats{
148+
expectedPodStats: &expectedStats{
149149
UsageCoreNanoSeconds: 600,
150150
UsageNanoCores: 0,
151151
WorkingSetBytes: 60,
@@ -178,7 +178,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
178178
newContainer("c1", running, nil),
179179
newContainer("i1", exitedValid, nil),
180180
},
181-
expectedPodStats: expectedStats{
181+
expectedPodStats: &expectedStats{
182182
UsageCoreNanoSeconds: 400,
183183
UsageNanoCores: 0,
184184
WorkingSetBytes: 40,
@@ -206,7 +206,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
206206
newContainer("c1", running, nil),
207207
newContainer("i1", exitedInvalid, nil),
208208
},
209-
expectedPodStats: expectedStats{
209+
expectedPodStats: &expectedStats{
210210
UsageCoreNanoSeconds: 400,
211211
UsageNanoCores: 0,
212212
WorkingSetBytes: 40,
@@ -236,7 +236,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
236236
UsageCoreNanoSeconds: 200,
237237
}),
238238
},
239-
expectedPodStats: expectedStats{
239+
expectedPodStats: &expectedStats{
240240
UsageCoreNanoSeconds: 800,
241241
UsageNanoCores: 400,
242242
WorkingSetBytes: 40,
@@ -264,7 +264,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
264264
UsageCoreNanoSeconds: 200,
265265
}),
266266
},
267-
expectedPodStats: expectedStats{
267+
expectedPodStats: &expectedStats{
268268
UsageCoreNanoSeconds: 400,
269269
UsageNanoCores: 200,
270270
WorkingSetBytes: 20,
@@ -292,7 +292,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
292292
UsageCoreNanoSeconds: 200,
293293
}),
294294
},
295-
expectedPodStats: expectedStats{
295+
expectedPodStats: &expectedStats{
296296
UsageCoreNanoSeconds: 400,
297297
UsageNanoCores: 200,
298298
WorkingSetBytes: 20,
@@ -306,6 +306,32 @@ func Test_criService_podSandboxStats(t *testing.T) {
306306
},
307307
expectError: false,
308308
},
309+
{
310+
desc: "pod sandbox with a container that has no cpu shouldn't error",
311+
metrics: map[string]*wstats.Statistics{
312+
"c1": {},
313+
"s1": {},
314+
},
315+
sandbox: sandboxPod("s1", initialStatsTimestamp, 200),
316+
containers: []containerstore.Container{
317+
newContainer("c1", running, &stats.ContainerStats{
318+
Timestamp: initialStatsTimestamp,
319+
UsageCoreNanoSeconds: 200,
320+
}),
321+
},
322+
expectedPodStats: nil,
323+
expectedContainerStats: []expectedStats{},
324+
expectError: false,
325+
},
326+
{
327+
desc: "pod sandbox with no stats in metric mapp will fail",
328+
metrics: map[string]*wstats.Statistics{},
329+
sandbox: sandboxPod("s1", initialStatsTimestamp, 200),
330+
containers: []containerstore.Container{},
331+
expectedPodStats: nil,
332+
expectedContainerStats: []expectedStats{},
333+
expectError: true,
334+
},
309335
} {
310336
t.Run(desc, func(t *testing.T) {
311337
actualPodStats, actualContainerStats, err := c.toPodSandboxStats(test.sandbox, test.metrics, test.containers, currentStatsTimestamp)
@@ -314,6 +340,13 @@ func Test_criService_podSandboxStats(t *testing.T) {
314340
return
315341
}
316342
assert.Nil(t, err)
343+
344+
if test.expectedPodStats == nil {
345+
assert.Nil(t, actualPodStats.Cpu)
346+
assert.Nil(t, actualPodStats.Memory)
347+
return
348+
}
349+
317350
assert.Equal(t, test.expectedPodStats.UsageCoreNanoSeconds, actualPodStats.Cpu.UsageCoreNanoSeconds.Value)
318351
assert.Equal(t, test.expectedPodStats.UsageNanoCores, actualPodStats.Cpu.UsageNanoCores.Value)
319352

pkg/cri/server/sandbox_stats_linux.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/containerd/cgroups/v3"
2525
"github.com/containerd/cgroups/v3/cgroup1"
2626
cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2"
27+
"github.com/containerd/containerd/errdefs"
2728
"github.com/containerd/containerd/log"
2829
sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox"
2930
"github.com/containernetworking/plugins/pkg/ns"
@@ -37,7 +38,7 @@ func (c *criService) podSandboxStats(
3738
meta := sandbox.Metadata
3839

3940
if sandbox.Status.Get().State != sandboxstore.StateReady {
40-
return nil, fmt.Errorf("failed to get pod sandbox stats since sandbox container %q is not in ready state", meta.ID)
41+
return nil, fmt.Errorf("failed to get pod sandbox stats since sandbox container %q is not in ready state: %w", meta.ID, errdefs.ErrUnavailable)
4142
}
4243

4344
stats, err := metricsForSandbox(sandbox)

pkg/cri/server/sandbox_stats_list.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,12 @@ import (
2020
"context"
2121
"fmt"
2222

23+
"github.com/containerd/containerd/errdefs"
24+
"github.com/containerd/containerd/log"
2325
sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox"
2426
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
27+
28+
"github.com/hashicorp/go-multierror"
2529
)
2630

2731
// ListPodSandboxStats returns stats of all ready sandboxes.
@@ -31,16 +35,21 @@ func (c *criService) ListPodSandboxStats(
3135
) (*runtime.ListPodSandboxStatsResponse, error) {
3236
sandboxes := c.sandboxesForListPodSandboxStatsRequest(r)
3337

38+
var errs *multierror.Error
3439
podSandboxStats := new(runtime.ListPodSandboxStatsResponse)
3540
for _, sandbox := range sandboxes {
3641
sandboxStats, err := c.podSandboxStats(ctx, sandbox)
37-
if err != nil {
38-
return nil, fmt.Errorf("failed to decode sandbox container metrics for sandbox %q: %w", sandbox.ID, err)
42+
switch {
43+
case errdefs.IsUnavailable(err):
44+
log.G(ctx).WithField("podsandboxid", sandbox.ID).Debugf("failed to get pod sandbox stats, this is likely a transient error: %v", err)
45+
case err != nil:
46+
errs = multierror.Append(errs, fmt.Errorf("failed to decode sandbox container metrics for sandbox %q: %w", sandbox.ID, err))
47+
default:
48+
podSandboxStats.Stats = append(podSandboxStats.Stats, sandboxStats)
3949
}
40-
podSandboxStats.Stats = append(podSandboxStats.Stats, sandboxStats)
4150
}
4251

43-
return podSandboxStats, nil
52+
return podSandboxStats, errs.ErrorOrNil()
4453
}
4554

4655
func (c *criService) sandboxesForListPodSandboxStatsRequest(r *runtime.ListPodSandboxStatsRequest) []sandboxstore.Sandbox {

0 commit comments

Comments
 (0)