When a host is experiencing high I/O load, there are cases where containerd-shim processes remain on the host even though the underlying processes have already vanished.
Even though there have already been multiple reports of the same symptom, I went ahead and created another issue because it was requested here, and in the hope of having this as the single remaining tracking issue. Possible duplicates:
That the containerd-shim processes are removed as well.
{
"status": {
"conditions": [
{
"type": "RuntimeReady",
"status": true,
"reason": "",
"message": ""
},
{
"type": "NetworkReady",
"status": true,
"reason": "",
"message": ""
}
]
},
"cniconfig": {
"PluginDirs": [
"/opt/cni/bin"
],
"PluginConfDir": "/etc/cni/net.d",
"PluginMaxConfNum": 1,
"Prefix": "eth",
"Networks": [
{
"Config": {
"Name": "cni-loopback",
"CNIVersion": "0.3.1",
"Plugins": [
{
"Network": {
"type": "loopback",
"ipam": {},
"dns": {}
},
"Source": "{\"type\":\"loopback\"}"
}
],
"Source": "{\n\"cniVersion\": \"0.3.1\",\n\"name\": \"cni-loopback\",\n\"plugins\": [{\n \"type\": \"loopback\"\n}]\n}"
},
"IFName": "lo"
},
{
"Config": {
"Name": "k8s-pod-network",
"CNIVersion": "0.3.1",
"Plugins": [
{
"Network": {
"type": "calico",
"ipam": {
"type": "host-local"
},
"dns": {}
},
"Source": "{\"datastore_type\":\"kubernetes\",\"ipam\":{\"ranges\":[[{\"subnet\":\"usePodCidr\"}]],\"routes\":[{\"dst\":\"0.0.0.0/0\"}],\"type\":\"host-local\"},\"kubernetes\":{\"kubeconfig\":\"/etc/cni/net.d/calico-kubeconfig\"},\"log_level\":\"error\",\"mtu\":1500,\"nodename\":\"myhost\",\"policy\":{\"type\":\"k8s\"},\"type\":\"calico\"}"
},
{
"Network": {
"type": "portmap",
"capabilities": {
"portMappings": true
},
"ipam": {},
"dns": {}
},
"Source": "{\"capabilities\":{\"portMappings\":true},\"type\":\"portmap\"}"
}
],
"Source": "{\n \"name\": \"k8s-pod-network\",\n \"cniVersion\": \"0.3.1\",\n \"plugins\": [\n {\n \"type\": \"calico\",\n \"log_level\": \"error\",\n \"datastore_type\": \"kubernetes\",\n \"nodename\": \"myhost\",\n \"mtu\": 1500,\n \"ipam\": {\n \"type\": \"host-local\",\n \"ranges\": [\n [\n {\n \"subnet\": \"usePodCidr\"\n }\n ]\n ],\n \"routes\": [\n {\n \"dst\": \"0.0.0.0/0\"\n }\n ]\n },\n \"policy\": {\n \"type\": \"k8s\"\n },\n \"kubernetes\": {\n \"kubeconfig\": \"/etc/cni/net.d/calico-kubeconfig\"\n }\n },\n {\n \"type\": \"portmap\",\n \"capabilities\": {\"portMappings\": true}\n }\n ]\n}"
},
"IFName": "eth0"
}
]
},
"config": {
"containerd": {
"snapshotter": "overlayfs",
"defaultRuntimeName": "default",
"defaultRuntime": {
"runtimeType": "io.containerd.runtime.v1.linux",
"runtimePath": "",
"runtimeEngine": "",
"PodAnnotations": null,
"ContainerAnnotations": null,
"runtimeRoot": "",
"options": null,
"privileged_without_host_devices": false,
"privileged_without_host_devices_all_devices_allowed": false,
"baseRuntimeSpec": "",
"cniConfDir": "",
"cniMaxConfNum": 0,
"snapshotter": "",
"sandboxMode": ""
},
"untrustedWorkloadRuntime": {
"runtimeType": "",
"runtimePath": "",
"runtimeEngine": "",
"PodAnnotations": null,
"ContainerAnnotations": null,
"runtimeRoot": "",
"options": null,
"privileged_without_host_devices": false,
"privileged_without_host_devices_all_devices_allowed": false,
"baseRuntimeSpec": "",
"cniConfDir": "",
"cniMaxConfNum": 0,
"snapshotter": "",
"sandboxMode": ""
},
"runtimes": {
"default": {
"runtimeType": "io.containerd.runtime.v1.linux",
"runtimePath": "",
"runtimeEngine": "",
"PodAnnotations": null,
"ContainerAnnotations": null,
"runtimeRoot": "",
"options": null,
"privileged_without_host_devices": false,
"privileged_without_host_devices_all_devices_allowed": false,
"baseRuntimeSpec": "",
"cniConfDir": "",
"cniMaxConfNum": 0,
"snapshotter": "",
"sandboxMode": "podsandbox"
},
"runc": {
"runtimeType": "io.containerd.runc.v2",
"runtimePath": "",
"runtimeEngine": "",
"PodAnnotations": null,
"ContainerAnnotations": null,
"runtimeRoot": "",
"options": {
"BinaryName": "",
"CriuImagePath": "",
"CriuPath": "",
"CriuWorkPath": "",
"IoGid": 0,
"IoUid": 0,
"NoNewKeyring": false,
"NoPivotRoot": false,
"Root": "",
"ShimCgroup": "",
"SystemdCgroup": false
},
"privileged_without_host_devices": false,
"privileged_without_host_devices_all_devices_allowed": false,
"baseRuntimeSpec": "",
"cniConfDir": "",
"cniMaxConfNum": 0,
"snapshotter": "",
"sandboxMode": "podsandbox"
}
},
"noPivot": false,
"disableSnapshotAnnotations": true,
"discardUnpackedLayers": false,
"ignoreBlockIONotEnabledErrors": false,
"ignoreRdtNotEnabledErrors": false
},
"cni": {
"binDir": "/opt/cni/bin",
"confDir": "/etc/cni/net.d",
"maxConfNum": 1,
"setupSerially": false,
"confTemplate": "",
"ipPref": ""
},
"registry": {
"configPath": "",
"mirrors": null,
"configs": null,
"auths": null,
"headers": null
},
"imageDecryption": {
"keyModel": "node"
},
"disableTCPService": true,
"streamServerAddress": "127.0.0.1",
"streamServerPort": "10260",
"streamIdleTimeout": "4h0m0s",
"enableSelinux": false,
"selinuxCategoryRange": 1024,
"sandboxImage": "registry-emea.app.corpintra.net/caas/pause:3.9",
"statsCollectPeriod": 10,
"systemdCgroup": true,
"enableTLSStreaming": false,
"x509KeyPairStreaming": {
"tlsCertFile": "",
"tlsKeyFile": ""
},
"maxContainerLogSize": 4194304,
"disableCgroup": false,
"disableApparmor": false,
"restrictOOMScoreAdj": false,
"maxConcurrentDownloads": 3,
"disableProcMount": false,
"unsetSeccompProfile": "",
"tolerateMissingHugetlbController": true,
"disableHugetlbController": true,
"device_ownership_from_security_context": false,
"ignoreImageDefinedVolumes": false,
"netnsMountsUnderStateDir": false,
"enableUnprivilegedPorts": false,
"enableUnprivilegedICMP": false,
"enableCDI": false,
"cdiSpecDirs": [
"/etc/cdi",
"/var/run/cdi"
],
"imagePullProgressTimeout": "1m0s",
"drainExecSyncIOTimeout": "0s",
"containerdRootDir": "/var/lib/containerd",
"containerdEndpoint": "/run/containerd/containerd.sock",
"rootDir": "/var/lib/containerd/io.containerd.grpc.v1.cri",
"stateDir": "/run/containerd/io.containerd.grpc.v1.cri"
},
"golang": "go1.20.7",
"lastCNILoadStatus": "OK",
"lastCNILoadStatus.default": "OK"
}
Description
When a host is experiencing high I/O load, there are cases where containerd-shim processes remain on the host even though the underlying processes have already vanished.
Even though there have already been multiple reports of the same symptom, I went ahead and created another issue because it was requested here, and in the hope of having this as the single remaining tracking issue. Possible duplicates:
@fuweid did already create PR #8954 to address the initial issue. Nevertheless, I am still able to reproduce the issue with containerd in all versions ranging from 1.7.4 up to 1.7.8.
The issue appears in the wild on some of our Kubernetes clusters running on containerd v1.7.5, which have either high I/O load or slow backing storage (which is mimicked by step 2 in the "Steps to reproduce the issue" section below).
There is also a relevant issue that describes ideas to make the task.Delete API retriable: #8981
Logs from containerd when the problem arises:
Steps to reproduce the issue
Shameless plug of #7496 (comment)
kubectl run nginx --image=nginx --restart=Never
sudo strace -p $(pidof target-shim) --trace=umount2 -f --detach-on=execve -e inject=umount2:delay_enter=12s
kill -9 $(pidof nginx)
Describe the results you received and expected
That the containerd-shim processes are removed as well.
What version of containerd are you using?
containerd github.com/containerd/containerd v1.7.5 fe457eb
Any other relevant information
runc --version
crictl info
uname -a
Show configuration if it is related to CRI plugin.
Johannes Frey <[email protected]>, Mercedes-Benz Tech Innovation GmbH (Provider Information)