On an idle system, after a few days, containers on that system have begun to enter an UNKNOWN state and then shortly after they are no longer UNKNOWN but other new containers enter the UNKNOWN state.
the zombie processes seem to stem from a containerd-shim process.
idle system that was installed successfully/healthy went into an unstable state after a few days.
{
"status": {
"conditions": [
{
"type": "RuntimeReady",
"status": true,
"reason": "",
"message": ""
},
{
"type": "NetworkReady",
"status": true,
"reason": "",
"message": ""
}
]
},
"cniconfig": {
"PluginDirs": [
"/opt/cni/bin"
],
"PluginConfDir": "/etc/cni/net.d",
"PluginMaxConfNum": 1,
"Prefix": "eth",
"Networks": [
{
"Config": {
"Name": "cni-loopback",
"CNIVersion": "0.3.1",
"Plugins": [
{
"Network": {
"type": "loopback",
"ipam": {},
"dns": {}
},
"Source": "{\"type\":\"loopback\"}"
}
],
"Source": "{\n\"cniVersion\": \"0.3.1\",\n\"name\": \"cni-loopback\",\n\"plugins\": [{\n \"type\": \"loopback\"\n}]\n}"
},
"IFName": "lo"
},
{
"Config": {
"Name": "k8s-pod-network",
"CNIVersion": "0.3.1",
"Plugins": [
{
"Network": {
"type": "calico",
"ipam": {
"type": "calico-ipam"
},
"dns": {}
},
"Source": "{\"datastore_type\":\"kubernetes\",\"ipam\":{\"type\":\"calico-ipam\"},\"kubernetes\":{\"kubeconfig\":\"/etc/cni/net.d/calico-kubeconfig\"},\"log_level\":\"info\",\"mtu\":1440,\"nodename\":\"apicdev3176\",\"policy\":{\"type\":\"k8s\"},\"type\":\"calico\"}"
},
{
"Network": {
"type": "portmap",
"capabilities": {
"portMappings": true
},
"ipam": {},
"dns": {}
},
"Source": "{\"capabilities\":{\"portMappings\":true},\"snat\":true,\"type\":\"portmap\"}"
},
{
"Network": {
"type": "bandwidth",
"capabilities": {
"bandwidth": true
},
"ipam": {},
"dns": {}
},
"Source": "{\"capabilities\":{\"bandwidth\":true},\"type\":\"bandwidth\"}"
}
],
"Source": "{\n \"name\": \"k8s-pod-network\",\n \"cniVersion\": \"0.3.1\",\n \"plugins\": [\n {\n \"type\": \"calico\",\n \"log_level\": \"info\",\n \"datastore_type\": \"kubernetes\",\n \"nodename\": \"apicdev3176\",\n \"mtu\": 1440,\n \"ipam\": {\n \"type\": \"calico-ipam\"\n },\n \"policy\": {\n \"type\": \"k8s\"\n },\n \"kubernetes\": {\n \"kubeconfig\": \"/etc/cni/net.d/calico-kubeconfig\"\n }\n },\n {\n \"type\": \"portmap\",\n \"snat\": true,\n \"capabilities\": {\"portMappings\": true}\n },\n {\n \"type\": \"bandwidth\",\n \"capabilities\": {\"bandwidth\": true}\n }\n ]\n}\n"
},
"IFName": "eth0"
}
]
},
"config": {
"containerd": {
"snapshotter": "overlayfs",
"defaultRuntimeName": "runc",
"defaultRuntime": {
"runtimeType": "",
"runtimePath": "",
"runtimeEngine": "",
"PodAnnotations": [],
"ContainerAnnotations": [],
"runtimeRoot": "",
"options": {},
"privileged_without_host_devices": false,
"baseRuntimeSpec": "",
"cniConfDir": "",
"cniMaxConfNum": 0
},
"untrustedWorkloadRuntime": {
"runtimeType": "",
"runtimePath": "",
"runtimeEngine": "",
"PodAnnotations": [],
"ContainerAnnotations": [],
"runtimeRoot": "",
"options": {},
"privileged_without_host_devices": false,
"baseRuntimeSpec": "",
"cniConfDir": "",
"cniMaxConfNum": 0
},
"runtimes": {
"runc": {
"runtimeType": "io.containerd.runc.v2",
"runtimePath": "",
"runtimeEngine": "",
"PodAnnotations": [],
"ContainerAnnotations": [],
"runtimeRoot": "",
"options": {
"BinaryName": "",
"CriuImagePath": "",
"CriuPath": "",
"CriuWorkPath": "",
"IoGid": 0,
"IoUid": 0,
"NoNewKeyring": false,
"NoPivotRoot": false,
"Root": "",
"ShimCgroup": "",
"SystemdCgroup": false
},
"privileged_without_host_devices": false,
"baseRuntimeSpec": "",
"cniConfDir": "",
"cniMaxConfNum": 0
}
},
"noPivot": false,
"disableSnapshotAnnotations": true,
"discardUnpackedLayers": false,
"ignoreRdtNotEnabledErrors": false
},
"cni": {
"binDir": "/opt/cni/bin",
"confDir": "/etc/cni/net.d",
"maxConfNum": 1,
"confTemplate": "",
"ipPref": ""
},
"registry": {
"configPath": "",
"mirrors": {},
"configs": {},
"auths": {},
"headers": {}
},
"imageDecryption": {
"keyModel": "node"
},
"disableTCPService": true,
"streamServerAddress": "127.0.0.1",
"streamServerPort": "0",
"streamIdleTimeout": "4h0m0s",
"enableSelinux": false,
"selinuxCategoryRange": 1024,
"sandboxImage": "registry.k8s.io/pause:3.6",
"statsCollectPeriod": 10,
"systemdCgroup": false,
"enableTLSStreaming": false,
"x509KeyPairStreaming": {
"tlsCertFile": "",
"tlsKeyFile": ""
},
"maxContainerLogSize": 16384,
"disableCgroup": false,
"disableApparmor": false,
"restrictOOMScoreAdj": false,
"maxConcurrentDownloads": 3,
"disableProcMount": false,
"unsetSeccompProfile": "",
"tolerateMissingHugetlbController": true,
"disableHugetlbController": true,
"device_ownership_from_security_context": false,
"ignoreImageDefinedVolumes": false,
"netnsMountsUnderStateDir": false,
"enableUnprivilegedPorts": false,
"enableUnprivilegedICMP": false,
"containerdRootDir": "/var/lib/containerd",
"containerdEndpoint": "/run/containerd/containerd.sock",
"rootDir": "/var/lib/containerd/io.containerd.grpc.v1.cri",
"stateDir": "/run/containerd/io.containerd.grpc.v1.cri"
},
"golang": "go1.19.7",
"lastCNILoadStatus": "OK",
"lastCNILoadStatus.default": "OK"
}
Description
On an idle system, after a few days, containers on that system have begun to enter an UNKNOWN state and then shortly after they are no longer UNKNOWN but other new containers enter the UNKNOWN state.
Aside from that, I also checked
topfor CPU usage and for zombie processes. CPU usage does spike but zombie processes are not accumulating, but rather it moves between 16-19 zombie processes at one timethe zombie processes seem to stem from a containerd-shim process.
Steps to reproduce the issue
N/A
Describe the results you received and expected
idle system that was installed successfully/healthy went into an unstable state after a few days.
What version of containerd are you using?
1.6.20
Any other relevant information
runc version 1.1.5
crictl info
uname -a
Show configuration if it is related to CRI plugin.