-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Description
Description
When pods are created and deleted frequently under high IO load, a lot of shims are leaked.

The sandbox and container are deleted successfully after a retry, but there is no log entry for the shim cleanup.
Log for a leaked shim:
Aug 07 17:30:30 10-129-111-17. containerd[122549]: time="2023-08-07T17:30:30.900156219+08:00" level=info msg="StopPodSandbox for \"15b079e7d06240afa8ecd3747e3e91c8da25fa38fa06c4ebc75292d90b8a7c37\""
Aug 07 17:30:33 10-129-111-17. containerd[122549]: time="2023-08-07T17:30:33.488242203+08:00" level=debug msg="received exit event &TaskExit{ContainerID:15b079e7d06240afa8ecd3747e3e91c8da25fa38fa06c4ebc75292d90b8a7c37,ID:15b079e7d06240afa8ecd3747e3e91c8da25fa38fa06c4ebc75292d90b8a7c37,Pid:264527,ExitStatus:137,ExitedAt:2023-08-07 09:30:33.488097873 +0000 UTC,XXX_unrecognized:[],}"
Aug 07 17:30:43 10-129-111-17. containerd[122549]: time="2023-08-07T17:30:43.488999573+08:00" level=debug msg="failed to delete task" error="context deadline exceeded" id=15b079e7d06240afa8ecd3747e3e91c8da25fa38fa06c4ebc75292d90b8a7c37
Aug 07 17:30:43 10-129-111-17 containerd[122549]: time="2023-08-07T17:30:43.489129568+08:00" level=error msg="failed to handle sandbox TaskExit event &TaskExit{ContainerID:15b079e7d06240afa8ecd3747e3e91c8da25fa38fa06c4ebc75292d90b8a7c37,ID:15b079e7d06240afa8ecd3747e3e91c8da25fa38fa06c4ebc75292d90b8a7c37,Pid:264527,ExitStatus:137,ExitedAt:2023-08-07 09:30:33.488097873 +0000 UTC,XXX_unrecognized:[],}" error="failed to stop sandbox: context deadline exceeded: unknown"
Aug 07 17:30:53 10-129-111-17 containerd[122549]: time="2023-08-07T17:30:53.447062203+08:00" level=info msg="TaskExit event &TaskExit{ContainerID:15b079e7d06240afa8ecd3747e3e91c8da25fa38fa06c4ebc75292d90b8a7c37,ID:15b079e7d06240afa8ecd3747e3e91c8da25fa38fa06c4ebc75292d90b8a7c37,Pid:264527,ExitStatus:137,ExitedAt:2023-08-07 09:30:33.488097873 +0000 UTC,XXX_unrecognized:[],}"
Aug 07 17:30:53 10-129-111-17 containerd[122549]: time="2023-08-07T17:30:53.527417390+08:00" level=info msg="TearDown network for sandbox \"15b079e7d06240afa8ecd3747e3e91c8da25fa38fa06c4ebc75292d90b8a7c37\" successfully"
Aug 07 17:30:53 10-129-111-17 containerd[122549]: time="2023-08-07T17:30:53.527436317+08:00" level=info msg="StopPodSandbox for \"15b079e7d06240afa8ecd3747e3e91c8da25fa38fa06c4ebc75292d90b8a7c37\" returns successfully"
Log for a normal (non-leaking) case:
Aug 07 16:58:00 10-129-111-17. containerd[71859]: time="2023-08-07T16:58:00.746114952+08:00" level=info msg="StopPodSandbox for \"02a42716d3c84953e03421e5ea49f58a53744ba4071dfabf93e0a97f877a4b92\""
Aug 07 16:58:00 10-129-111-17. containerd[71859]: time="2023-08-07T16:58:00.772147284+08:00" level=debug msg="received exit event &TaskExit{ContainerID:02a42716d3c84953e03421e5ea49f58a53744ba4071dfabf93e0a97f877a4b92,ID:02a42716d3c84953e03421e5ea49f58a53744ba4071dfabf93e0a97f877a4b92,Pid:184049,ExitStatus:137,ExitedAt:2023-08-07 08:58:00.772018873 +0000 UTC,XXX_unrecognized:[],}"
Aug 07 16:58:00 10-129-111-17. containerd[71859]: time="2023-08-07T16:58:00.987571479+08:00" level=info msg="shim disconnected" id=02a42716d3c84953e03421e5ea49f58a53744ba4071dfabf93e0a97f877a4b92
Aug 07 16:58:00 10-129-111-17. containerd[71859]: time="2023-08-07T16:58:00.987627875+08:00" level=warning msg="cleaning up after shim disconnected" id=02a42716d3c84953e03421e5ea49f58a53744ba4071dfabf93e0a97f877a4b92 namespace=k8s.io
Aug 07 16:58:03 10-129-111-17. containerd[71859]: time="2023-08-07T16:58:01+08:00" level=info msg="Processing CNI request" file="handler.go:264" action=teardown netns=/var/run/netns/cni-8eb087b0-1d09-4f32-22bf-166584bf64ed sandbox_id=02a42716d3c84953e03421e5ea49f58a53744ba4071dfabf93e0a97f877a4b92
From what I observed, all of these cases happened when deleting the shim failed the first time, here:
https://github.com/containerd/containerd/blob/release/1.6/runtime/v2/shim.go#L275
and the event then entered the backoff queue here:
https://github.com/containerd/containerd/blob/release/1.6/pkg/cri/server/events.go#L284
It seems the backoff logic doesn't handle the shim disconnect correctly. Is that right?
Steps to reproduce the issue
- Keep the node's IO busy
- Create and delete pods frequently
Describe the results you received and expected
I am quite new to containerd, so maybe someone can tell me:
- why the shim is not properly killed, or why the client shim is not closed, in this situation
- how to fix this issue, or how I can go about fixing it myself
What version of containerd are you using?
v1.5.9 v1.6.8
Any other relevant information
Runc:
runc version 1.0.1
commit: v1.0.1-3-g0b17c43
spec: 1.0.2-dev
go: go1.16.6
libseccomp: 2.5.1
crictl info
"status": {
"conditions": [
{
"type": "RuntimeReady",
"status": true,
"reason": "",
"message": ""
},
{
"type": "NetworkReady",
"status": true,
"reason": "",
"message": ""
}
]
},
"cniconfig": {
"PluginDirs": [
"/opt/cni/bin"
],
"PluginConfDir": "/etc/cni/net.d",
"PluginMaxConfNum": 1,
"Prefix": "eth",
"Networks": [
{
"Config": {
"Name": "cni-loopback",
"CNIVersion": "0.3.1",
"Plugins": [
{
"Network": {
"type": "loopback",
"ipam": {},
"dns": {}
},
"Source": "{\"type\":\"loopback\"}"
}
],
"Source": "{\n\"cniVersion\": \"0.3.1\",\n\"name\": \"cni-loopback\",\n\"plugins\": [{\n \"type\": \"loopback\"\n}]\n}"
},
"IFName": "lo"
},
{
"Config": {
"Name": "anchor",
"CNIVersion": "0.3.1",
"Plugins": [
{
"Network": {
"type": "anchor-cni",
"capabilities": {
"bandwidth": true,
"io.kubernetes.cri.pod-annotations": true
},
"ipam": {},
"dns": {}
},
"Source": "{\"capabilities\":{\"bandwidth\":true,\"io.kubernetes.cri.pod-annotations\":true},\"mtu\":1430,\"sysctl\":{\"net.core.somaxconn\":\"65535\",\"net.ipv4.tcp_max_syn_backlog\":\"65535\",\"net.ipv4.tcp_retries2\":\"7\",\"net.ipv4.tcp_tw_reuse\":\"1\"},\"type\":\"anchor-cni\"}"
}
],
"Source": "{\n \"cniVersion\": \"0.3.1\",\n \"name\": \"anchor\",\n \"plugins\": [\n {\n \"capabilities\": {\n \"bandwidth\": true,\n \"io.kubernetes.cri.pod-annotations\": true\n },\n \"mtu\": 1430,\n \"sysctl\": {\n \"net.core.somaxconn\": \"65535\",\n \"net.ipv4.tcp_max_syn_backlog\": \"65535\",\n \"net.ipv4.tcp_retries2\": \"7\",\n \"net.ipv4.tcp_tw_reuse\": \"1\"\n },\n \"type\": \"anchor-cni\"\n }\n ]\n}"
},
"IFName": "eth0"
}
]
},
"config": {
"containerd": {
"snapshotter": "overlayfs",
"defaultRuntimeName": "runc",
"defaultRuntime": {
"runtimeType": "",
"runtimePath": "",
"runtimeEngine": "",
"PodAnnotations": null,
"ContainerAnnotations": null,
"runtimeRoot": "",
"options": null,
"privileged_without_host_devices": false,
"baseRuntimeSpec": "",
"cniConfDir": "",
"cniMaxConfNum": 0
},
"untrustedWorkloadRuntime": {
"runtimeType": "",
"runtimePath": "",
"runtimeEngine": "",
"PodAnnotations": null,
"ContainerAnnotations": null,
"runtimeRoot": "",
"options": null,
"privileged_without_host_devices": false,
"baseRuntimeSpec": "",
"cniConfDir": "",
"cniMaxConfNum": 0
},
"runtimes": {
"runc": {
"runtimeType": "io.containerd.runc.v2",
"runtimePath": "",
"runtimeEngine": "",
"PodAnnotations": null,
"ContainerAnnotations": null,
"runtimeRoot": "",
"options": {
"BinaryName": "",
"CriuImagePath": "",
"CriuPath": "",
"CriuWorkPath": "",
"IoGid": 0,
"IoUid": 0,
"NoNewKeyring": false,
"NoPivotRoot": false,
"Root": "",
"ShimCgroup": "",
"SystemdCgroup": false
},
"privileged_without_host_devices": false,
"baseRuntimeSpec": "",
"cniConfDir": "",
"cniMaxConfNum": 0
}
},
"noPivot": false,
"disableSnapshotAnnotations": true,
"discardUnpackedLayers": false,
"ignoreRdtNotEnabledErrors": false
},
"cni": {
"binDir": "/opt/cni/bin",
"confDir": "/etc/cni/net.d",
"maxConfNum": 1,
"confTemplate": "",
"ipPref": ""
},
"registry": {
"configPath": "",
"mirrors": null,
"configs": null,
"auths": null,
"headers": null
},
"imageDecryption": {
"keyModel": "node"
},
"disableTCPService": true,
"streamServerAddress": "127.0.0.1",
"streamServerPort": "0",
"streamIdleTimeout": "4h0m0s",
"enableSelinux": false,
"selinuxCategoryRange": 1024,
"sandboxImage": "k8s.gcr.io/pause:3.6",
"statsCollectPeriod": 10,
"systemdCgroup": false,
"enableTLSStreaming": false,
"x509KeyPairStreaming": {
"tlsCertFile": "",
"tlsKeyFile": ""
},
"maxContainerLogSize": 16384,
"disableCgroup": false,
"disableApparmor": false,
"restrictOOMScoreAdj": false,
"maxConcurrentDownloads": 3,
"disableProcMount": false,
"unsetSeccompProfile": "",
"tolerateMissingHugetlbController": true,
"disableHugetlbController": true,
"device_ownership_from_security_context": false,
"ignoreImageDefinedVolumes": false,
"netnsMountsUnderStateDir": false,
"enableUnprivilegedPorts": false,
"enableUnprivilegedICMP": false,
"containerdRootDir": "/var/lib/containerd",
"containerdEndpoint": "/run/containerd/containerd.sock",
"rootDir": "/var/lib/containerd/io.containerd.grpc.v1.cri",
"stateDir": "/run/containerd/io.containerd.grpc.v1.cri"
},
"golang": "go1.16.6",
"lastCNILoadStatus": "OK",
"lastCNILoadStatus.default": "OK"
}
Show configuration if it is related to CRI plugin.
disabled_plugins = []
imports = ["/etc/containerd/config.toml"]
oom_score = 0
plugin_dir = ""
required_plugins = []
root = "/var/lib/containerd"
state = "/run/containerd"
version = 2
[cgroup]
path = ""
[debug]
address = "/run/containerd/debug.sock"
format = ""
gid = 0
level = "debug"
uid = 0
[grpc]
address = "/run/containerd/containerd.sock"
gid = 0
max_recv_message_size = 16777216
max_send_message_size = 16777216
tcp_address = ""
tcp_tls_cert = ""
tcp_tls_key = ""
uid = 0
[metrics]
address = ":7083"
grpc_histogram = false
[plugins]
[plugins."io.containerd.gc.v1.scheduler"]
deletion_threshold = 0
mutation_threshold = 100
pause_threshold = 0.02
schedule_delay = "0s"
startup_delay = "100ms"
[plugins."io.containerd.grpc.v1.cri"]
disable_apparmor = false
disable_cgroup = false
disable_hugetlb_controller = true
disable_proc_mount = false
disable_tcp_service = true
enable_selinux = false
enable_tls_streaming = false
ignore_image_defined_volumes = false
max_concurrent_downloads = 3
max_container_log_line_size = 16384
netns_mounts_under_state_dir = false
restrict_oom_score_adj = false
sandbox_image = "k8s.gcr.io/pause:3.5"
selinux_category_range = 1024
stats_collect_period = 10
stream_idle_timeout = "4h0m0s"
stream_server_address = "127.0.0.1"
stream_server_port = "0"
systemd_cgroup = false
tolerate_missing_hugetlb_controller = true
unset_seccomp_profile = ""
[plugins."io.containerd.grpc.v1.cri".cni]
bin_dir = "/opt/cni/bin"
conf_dir = "/etc/cni/net.d"
conf_template = ""
max_conf_num = 1
[plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "runc"
disable_same_image_parallel_pull = false
disable_snapshot_annotations = true
discard_unpacked_layers = false
no_pivot = false
snapshotter = "overlayfs"
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
base_runtime_spec = ""
container_annotations = []
pod_annotations = []
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = ""
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime.options]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
base_runtime_spec = ""
container_annotations = []
pod_annotations = []
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
BinaryName = ""
CriuImagePath = ""
CriuPath = ""
CriuWorkPath = ""
IoGid = 0
IoUid = 0
NoNewKeyring = false
NoPivotRoot = false
Root = ""
ShimCgroup = ""
SystemdCgroup = false
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
base_runtime_spec = ""
container_annotations = []
pod_annotations = []
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = ""
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime.options]
[plugins."io.containerd.grpc.v1.cri".image_decryption]
key_model = "node"
[plugins."io.containerd.grpc.v1.cri".registry]
config_path = ""
[plugins."io.containerd.grpc.v1.cri".registry.auths]
[plugins."io.containerd.grpc.v1.cri".registry.configs]
[plugins."io.containerd.grpc.v1.cri".registry.headers]
[plugins."io.containerd.grpc.v1.cri".registry.mirrors]
[plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
tls_cert_file = ""
tls_key_file = ""
[plugins."io.containerd.internal.v1.opt"]
path = "/opt/containerd"
[plugins."io.containerd.internal.v1.restart"]
interval = "10s"
[plugins."io.containerd.metadata.v1.bolt"]
content_sharing_policy = "shared"
[plugins."io.containerd.monitor.v1.cgroups"]
no_prometheus = false
[plugins."io.containerd.runtime.v1.linux"]
no_shim = false
runtime = "runc"
runtime_root = ""
shim = "containerd-shim"
shim_debug = true
[plugins."io.containerd.runtime.v2.task"]
platforms = ["linux/amd64"]
[plugins."io.containerd.service.v1.diff-service"]
default = ["walking"]
[plugins."io.containerd.snapshotter.v1.aufs"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.btrfs"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.devmapper"]
async_remove = false
base_image_size = ""
pool_name = ""
root_path = ""
[plugins."io.containerd.snapshotter.v1.native"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.overlayfs"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.zfs"]
root_path = ""
[proxy_plugins]
[stream_processors]
[stream_processors."io.containerd.ocicrypt.decoder.v1.tar"]
accepts = ["application/vnd.oci.image.layer.v1.tar+encrypted"]
args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
path = "ctd-decoder"
returns = "application/vnd.oci.image.layer.v1.tar"
[stream_processors."io.containerd.ocicrypt.decoder.v1.tar.gzip"]
accepts = ["application/vnd.oci.image.layer.v1.tar+gzip+encrypted"]
args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
path = "ctd-decoder"
returns = "application/vnd.oci.image.layer.v1.tar+gzip"
[timeouts]
"io.containerd.timeout.shim.cleanup" = "5s"
"io.containerd.timeout.shim.load" = "5s"
"io.containerd.timeout.shim.shutdown" = "3s"
"io.containerd.timeout.task.state" = "2s"
[ttrpc]
address = ""
gid = 0
uid = 0