Skip to content

Commit f914edf

Browse files
jsturtevantmarosset
authored andcommitted
[cri] Handle Windows pod transitions gracefully
When the pods are transitioning there are several cases where containers might not be in valid state. There were several cases where the stats where failing hard but we should just continue on as they are transient and will be picked up again when kubelet queries for the stats again. Signed-off-by: James Sturtevant <[email protected]> Signed-off-by: Mark Rossetti <[email protected]>
1 parent ad9d1a8 commit f914edf

8 files changed

Lines changed: 150 additions & 43 deletions

pkg/cri/sbserver/sandbox_stats_linux.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/containerd/cgroups/v3"
2525
"github.com/containerd/cgroups/v3/cgroup1"
2626
cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2"
27+
"github.com/containerd/containerd/errdefs"
2728
"github.com/containerd/containerd/log"
2829
sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox"
2930
"github.com/containernetworking/plugins/pkg/ns"
@@ -37,7 +38,7 @@ func (c *criService) podSandboxStats(
3738
meta := sandbox.Metadata
3839

3940
if sandbox.Status.Get().State != sandboxstore.StateReady {
40-
return nil, fmt.Errorf("failed to get pod sandbox stats since sandbox container %q is not in ready state", meta.ID)
41+
return nil, fmt.Errorf("failed to get pod sandbox stats since sandbox container %q is not in ready state: %w", meta.ID, errdefs.ErrUnavailable)
4142
}
4243

4344
stats, err := metricsForSandbox(sandbox)

pkg/cri/sbserver/sandbox_stats_list.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,10 @@ import (
2020
"context"
2121
"fmt"
2222

23+
"github.com/containerd/containerd/errdefs"
24+
"github.com/containerd/containerd/log"
2325
sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox"
26+
"github.com/hashicorp/go-multierror"
2427
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
2528
)
2629

@@ -31,16 +34,21 @@ func (c *criService) ListPodSandboxStats(
3134
) (*runtime.ListPodSandboxStatsResponse, error) {
3235
sandboxes := c.sandboxesForListPodSandboxStatsRequest(r)
3336

37+
var errs *multierror.Error
3438
podSandboxStats := new(runtime.ListPodSandboxStatsResponse)
3539
for _, sandbox := range sandboxes {
3640
sandboxStats, err := c.podSandboxStats(ctx, sandbox)
37-
if err != nil {
38-
return nil, fmt.Errorf("failed to decode sandbox container metrics for sandbox %q: %w", sandbox.ID, err)
41+
switch {
42+
case errdefs.IsUnavailable(err):
43+
log.G(ctx).WithField("podsandboxid", sandbox.ID).Debugf("failed to get pod sandbox stats, this is likely a transient error: %v", err)
44+
case err != nil:
45+
errs = multierror.Append(errs, fmt.Errorf("failed to decode sandbox container metrics for sandbox %q: %w", sandbox.ID, err))
46+
default:
47+
podSandboxStats.Stats = append(podSandboxStats.Stats, sandboxStats)
3948
}
40-
podSandboxStats.Stats = append(podSandboxStats.Stats, sandboxStats)
4149
}
4250

43-
return podSandboxStats, nil
51+
return podSandboxStats, errs.ErrorOrNil()
4452
}
4553

4654
func (c *criService) sandboxesForListPodSandboxStatsRequest(r *runtime.ListPodSandboxStatsRequest) []sandboxstore.Sandbox {

pkg/cri/sbserver/sandbox_stats_windows.go

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/Microsoft/hcsshim/hcn"
2727
"github.com/containerd/containerd/api/services/tasks/v1"
2828
"github.com/containerd/containerd/api/types"
29+
"github.com/containerd/containerd/errdefs"
2930
"github.com/containerd/containerd/log"
3031
containerstore "github.com/containerd/containerd/pkg/cri/store/container"
3132
sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox"
@@ -36,12 +37,11 @@ import (
3637

3738
func (c *criService) podSandboxStats(
3839
ctx context.Context,
39-
sandbox sandboxstore.Sandbox,
40-
) (*runtime.PodSandboxStats, error) {
40+
sandbox sandboxstore.Sandbox) (*runtime.PodSandboxStats, error) {
4141
meta := sandbox.Metadata
4242

4343
if sandbox.Status.Get().State != sandboxstore.StateReady {
44-
return nil, fmt.Errorf("failed to get pod sandbox stats since sandbox container %q is not in ready state", meta.ID)
44+
return nil, fmt.Errorf("failed to get pod sandbox stats since sandbox container %q is not in ready state: %w", meta.ID, errdefs.ErrUnavailable)
4545
}
4646

4747
timestamp := time.Now()
@@ -62,7 +62,7 @@ func (c *criService) podSandboxStats(
6262

6363
statsMap, err := convertMetricsToWindowsStats(metrics, sandbox)
6464
if err != nil {
65-
return nil, err
65+
return nil, fmt.Errorf("failed to convert stats: %w", err)
6666
}
6767

6868
podCPU, containerStats, err := c.toPodSandboxStats(sandbox, statsMap, containers, timestamp)
@@ -72,12 +72,11 @@ func (c *criService) podSandboxStats(
7272
podSandboxStats.Windows.Cpu = podCPU.Cpu
7373
podSandboxStats.Windows.Memory = podCPU.Memory
7474
podSandboxStats.Windows.Containers = containerStats
75-
7675
podSandboxStats.Windows.Network = windowsNetworkUsage(ctx, sandbox, timestamp)
7776

7877
pidCount, err := c.getSandboxPidCount(ctx, sandbox)
7978
if err != nil {
80-
return nil, err
79+
return nil, fmt.Errorf("failed to get pid count: %w", err)
8180
}
8281

8382
podSandboxStats.Windows.Process = &runtime.WindowsProcessUsage{
@@ -131,11 +130,13 @@ func (c *criService) toPodSandboxStats(sandbox sandboxstore.Sandbox, statsMap ma
131130

132131
if cntr.Status.Get().State() != runtime.ContainerState_CONTAINER_RUNNING {
133132
// containers that are just created, in a failed state or exited (init containers) will not have stats
133+
log.L.Warnf("failed to get container stats since container %q is not in running state", cntr.ID)
134134
continue
135135
}
136136

137137
if containerMetric == nil {
138-
return nil, nil, fmt.Errorf("failed to find metrics for container with id %s: %w", cntr.ID, err)
138+
log.L.Warnf("no metrics found for container %q", cntr.ID)
139+
continue
139140
}
140141

141142
containerStats, err := c.convertToCRIStats(containerMetric)
@@ -144,7 +145,7 @@ func (c *criService) toPodSandboxStats(sandbox sandboxstore.Sandbox, statsMap ma
144145
}
145146

146147
// Calculate NanoCores for container
147-
if containerStats.Cpu.UsageCoreNanoSeconds != nil {
148+
if containerStats.Cpu != nil && containerStats.Cpu.UsageCoreNanoSeconds != nil {
148149
nanoCoreUsage := getUsageNanoCores(containerStats.Cpu.UsageCoreNanoSeconds.Value, cntr.Stats, containerStats.Cpu.Timestamp)
149150
containerStats.Cpu.UsageNanoCores = &runtime.UInt64Value{Value: nanoCoreUsage}
150151
}
@@ -180,7 +181,7 @@ func (c *criService) toPodSandboxStats(sandbox sandboxstore.Sandbox, statsMap ma
180181
}
181182

182183
// Calculate NanoCores for pod after adding containers cpu including the pods cpu
183-
if podRuntimeStats.Cpu.UsageCoreNanoSeconds != nil {
184+
if podRuntimeStats.Cpu != nil && podRuntimeStats.Cpu.UsageCoreNanoSeconds != nil {
184185
nanoCoreUsage := getUsageNanoCores(podRuntimeStats.Cpu.UsageCoreNanoSeconds.Value, sandbox.Stats, podRuntimeStats.Cpu.Timestamp)
185186
podRuntimeStats.Cpu.UsageNanoCores = &runtime.UInt64Value{Value: nanoCoreUsage}
186187
}
@@ -391,10 +392,16 @@ func (c *criService) getSandboxPidCount(ctx context.Context, sandbox sandboxstor
391392
// get process count inside PodSandbox for Windows
392393
task, err := sandbox.Container.Task(ctx, nil)
393394
if err != nil {
395+
if errdefs.IsNotFound(err) {
396+
return 0, nil
397+
}
394398
return 0, err
395399
}
396400
processes, err := task.Pids(ctx)
397401
if err != nil {
402+
if errdefs.IsNotFound(err) {
403+
return 0, nil
404+
}
398405
return 0, err
399406
}
400407
pidCount += uint64(len(processes))
@@ -416,6 +423,9 @@ func (c *criService) getSandboxPidCount(ctx context.Context, sandbox sandboxstor
416423

417424
processes, err := task.Pids(ctx)
418425
if err != nil {
426+
if errdefs.IsNotFound(err) {
427+
continue
428+
}
419429
return 0, err
420430
}
421431
pidCount += uint64(len(processes))

pkg/cri/sbserver/sandbox_stats_windows_test.go

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
9393
metrics map[string]*wstats.Statistics
9494
sandbox sandboxstore.Sandbox
9595
containers []containerstore.Container
96-
expectedPodStats expectedStats
96+
expectedPodStats *expectedStats
9797
expectedContainerStats []expectedStats
9898
expectError bool
9999
}{
@@ -102,7 +102,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
102102
metrics: map[string]*wstats.Statistics{},
103103
sandbox: sandboxstore.Sandbox{},
104104
containers: []containerstore.Container{},
105-
expectedPodStats: expectedStats{},
105+
expectedPodStats: &expectedStats{},
106106
expectedContainerStats: []expectedStats{},
107107
expectError: true,
108108
},
@@ -120,7 +120,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
120120
containers: []containerstore.Container{
121121
newContainer("c1", running, nil),
122122
},
123-
expectedPodStats: expectedStats{
123+
expectedPodStats: &expectedStats{
124124
UsageCoreNanoSeconds: 400,
125125
UsageNanoCores: 0,
126126
WorkingSetBytes: 40,
@@ -152,7 +152,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
152152
newContainer("c1", running, nil),
153153
newContainer("i1", running, nil),
154154
},
155-
expectedPodStats: expectedStats{
155+
expectedPodStats: &expectedStats{
156156
UsageCoreNanoSeconds: 600,
157157
UsageNanoCores: 0,
158158
WorkingSetBytes: 60,
@@ -186,7 +186,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
186186
newContainer("c1", running, nil),
187187
newContainer("i1", exitedValid, nil),
188188
},
189-
expectedPodStats: expectedStats{
189+
expectedPodStats: &expectedStats{
190190
UsageCoreNanoSeconds: 400,
191191
UsageNanoCores: 0,
192192
WorkingSetBytes: 40,
@@ -215,7 +215,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
215215
newContainer("c1", running, nil),
216216
newContainer("i1", exitedInvalid, nil),
217217
},
218-
expectedPodStats: expectedStats{
218+
expectedPodStats: &expectedStats{
219219
UsageCoreNanoSeconds: 400,
220220
UsageNanoCores: 0,
221221
WorkingSetBytes: 40,
@@ -246,7 +246,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
246246
UsageCoreNanoSeconds: 200,
247247
}),
248248
},
249-
expectedPodStats: expectedStats{
249+
expectedPodStats: &expectedStats{
250250
UsageCoreNanoSeconds: 800,
251251
UsageNanoCores: 400,
252252
WorkingSetBytes: 40,
@@ -275,7 +275,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
275275
UsageCoreNanoSeconds: 200,
276276
}),
277277
},
278-
expectedPodStats: expectedStats{
278+
expectedPodStats: &expectedStats{
279279
UsageCoreNanoSeconds: 400,
280280
UsageNanoCores: 200,
281281
WorkingSetBytes: 20,
@@ -304,7 +304,7 @@ func Test_criService_podSandboxStats(t *testing.T) {
304304
UsageCoreNanoSeconds: 200,
305305
}),
306306
},
307-
expectedPodStats: expectedStats{
307+
expectedPodStats: &expectedStats{
308308
UsageCoreNanoSeconds: 400,
309309
UsageNanoCores: 200,
310310
WorkingSetBytes: 20,
@@ -318,6 +318,32 @@ func Test_criService_podSandboxStats(t *testing.T) {
318318
},
319319
expectError: false,
320320
},
321+
{
322+
desc: "pod sandbox with a container that has no cpu shouldn't error",
323+
metrics: map[string]*wstats.Statistics{
324+
"c1": {},
325+
"s1": {},
326+
},
327+
sandbox: sandboxPod("s1", initialStatsTimestamp, 200),
328+
containers: []containerstore.Container{
329+
newContainer("c1", running, &stats.ContainerStats{
330+
Timestamp: initialStatsTimestamp,
331+
UsageCoreNanoSeconds: 200,
332+
}),
333+
},
334+
expectedPodStats: nil,
335+
expectedContainerStats: []expectedStats{},
336+
expectError: false,
337+
},
338+
{
339+
desc: "pod sandbox with no stats in metric mapp will fail",
340+
metrics: map[string]*wstats.Statistics{},
341+
sandbox: sandboxPod("s1", initialStatsTimestamp, 200),
342+
containers: []containerstore.Container{},
343+
expectedPodStats: nil,
344+
expectedContainerStats: []expectedStats{},
345+
expectError: true,
346+
},
321347
} {
322348
test := test
323349
t.Run(test.desc, func(t *testing.T) {
@@ -327,6 +353,13 @@ func Test_criService_podSandboxStats(t *testing.T) {
327353
return
328354
}
329355
assert.Nil(t, err)
356+
357+
if test.expectedPodStats == nil {
358+
assert.Nil(t, actualPodStats.Cpu)
359+
assert.Nil(t, actualPodStats.Memory)
360+
return
361+
}
362+
330363
assert.Equal(t, test.expectedPodStats.UsageCoreNanoSeconds, actualPodStats.Cpu.UsageCoreNanoSeconds.Value)
331364
assert.Equal(t, test.expectedPodStats.UsageNanoCores, actualPodStats.Cpu.UsageNanoCores.Value)
332365

pkg/cri/server/sandbox_stats_linux.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/containerd/cgroups/v3"
2525
"github.com/containerd/cgroups/v3/cgroup1"
2626
cgroupsv2 "github.com/containerd/cgroups/v3/cgroup2"
27+
"github.com/containerd/containerd/errdefs"
2728
"github.com/containerd/containerd/log"
2829
sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox"
2930
"github.com/containernetworking/plugins/pkg/ns"
@@ -37,7 +38,7 @@ func (c *criService) podSandboxStats(
3738
meta := sandbox.Metadata
3839

3940
if sandbox.Status.Get().State != sandboxstore.StateReady {
40-
return nil, fmt.Errorf("failed to get pod sandbox stats since sandbox container %q is not in ready state", meta.ID)
41+
return nil, fmt.Errorf("failed to get pod sandbox stats since sandbox container %q is not in ready state: %w", meta.ID, errdefs.ErrUnavailable)
4142
}
4243

4344
stats, err := metricsForSandbox(sandbox)

pkg/cri/server/sandbox_stats_list.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,12 @@ import (
2020
"context"
2121
"fmt"
2222

23+
"github.com/containerd/containerd/errdefs"
24+
"github.com/containerd/containerd/log"
2325
sandboxstore "github.com/containerd/containerd/pkg/cri/store/sandbox"
2426
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
27+
28+
"github.com/hashicorp/go-multierror"
2529
)
2630

2731
// ListPodSandboxStats returns stats of all ready sandboxes.
@@ -31,16 +35,21 @@ func (c *criService) ListPodSandboxStats(
3135
) (*runtime.ListPodSandboxStatsResponse, error) {
3236
sandboxes := c.sandboxesForListPodSandboxStatsRequest(r)
3337

38+
var errs *multierror.Error
3439
podSandboxStats := new(runtime.ListPodSandboxStatsResponse)
3540
for _, sandbox := range sandboxes {
3641
sandboxStats, err := c.podSandboxStats(ctx, sandbox)
37-
if err != nil {
38-
return nil, fmt.Errorf("failed to decode sandbox container metrics for sandbox %q: %w", sandbox.ID, err)
42+
switch {
43+
case errdefs.IsUnavailable(err):
44+
log.G(ctx).WithField("podsandboxid", sandbox.ID).Debugf("failed to get pod sandbox stats, this is likely a transient error: %v", err)
45+
case err != nil:
46+
errs = multierror.Append(errs, fmt.Errorf("failed to decode sandbox container metrics for sandbox %q: %w", sandbox.ID, err))
47+
default:
48+
podSandboxStats.Stats = append(podSandboxStats.Stats, sandboxStats)
3949
}
40-
podSandboxStats.Stats = append(podSandboxStats.Stats, sandboxStats)
4150
}
4251

43-
return podSandboxStats, nil
52+
return podSandboxStats, errs.ErrorOrNil()
4453
}
4554

4655
func (c *criService) sandboxesForListPodSandboxStatsRequest(r *runtime.ListPodSandboxStatsRequest) []sandboxstore.Sandbox {

0 commit comments

Comments
 (0)