Description

When I restart containerd, systemd shuts down k3s as well, because k3s depends on containerd. A few containers (csi-nfs-controller and snapshot-controller) fail if they lose their connection to the k8s API, which they do when k3s restarts. Normally this would be fine, but because containerd is down at the same time, something gets into a bad state and I end up getting the `CreateContainerError: failed to reserve container name` error every few seconds, which never resolves (I've left it running for 24 hours). I'm assuming that a container failing while both k3s and containerd are offline causes a data issue. Some of my own containers also used to error out when they lost the connection to k3s, but I've since patched those to catch the failure and retry every 3s, and those containers no longer get stuck after a containerd restart.
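For reference, the patch to my own images is essentially a wrapper entrypoint that retries instead of letting the container exit. A minimal sketch (`/app/server` is a placeholder for the real binary):

```sh
#!/bin/sh
# Instead of exiting when the k8s API connection drops (which is what
# used to wedge these containers), keep retrying until it succeeds.
while ! /app/server "$@"; do
  echo "process failed, retrying in 3s" >&2
  sleep 3
done
```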
I'm running K3s with containerd on NixOS using ZFS storage.
Restarting containerd is the simplest way to reproduce this issue, but I've also seen it occur when I saturate the network (e.g. with a large download) badly enough that timeouts start happening. In both cases I see `context deadline exceeded` errors in the logs before it gets stuck with the reserved-name errors. I'd also like to be able to recover when a pod saturates the network, instead of any pods that fail during that window being unrecoverable.
I would appreciate any workaround ideas, even if this can't be reproduced or fixed. I wasn't sure how to edit the boltdb manually, or whether doing so could help fix this issue.
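In case it helps anyone suggesting a fix: I assume the database in question could at least be inspected with the bbolt CLI. A sketch, with the db path taken from the crictl info output below; which bucket actually holds the reserved names is an assumption I haven't verified:

```sh
# containerd must be stopped first; bolt database files are single-writer.
systemctl stop containerd

DB=/var/lib/containerd/io.containerd.metadata.v1.bolt/meta.db
bbolt buckets "$DB"   # list top-level buckets ("v1" should be containerd's)
bbolt keys "$DB" v1   # list keys under it (namespaces, e.g. k8s.io)
```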
crictl info results:

```json
{
"status": {
"conditions": [
{
"type": "RuntimeReady",
"status": true,
"reason": "",
"message": ""
},
{
"type": "NetworkReady",
"status": true,
"reason": "",
"message": ""
},
{
"type": "ContainerdHasNoDeprecationWarnings",
"status": false,
"reason": "ContainerdHasDeprecationWarnings",
"message": "{\"io.containerd.deprecation/cri-registry-configs\":\"The `configs` property of `[plugins.\\\"io.containerd.grpc.v1.cri\\\".registry]` is deprecated since containerd v1.5 and will be removed in containerd v2.0. Use `config_path` instead.\",\"io.containerd.deprecation/cri-registry-mirrors\":\"The `mirrors` property of `[plugins.\\\"io.containerd.grpc.v1.cri\\\".registry]` is deprecated since containerd v1.5 and will be removed in containerd v2.0. Use `config_path` instead.\"}"
}
]
},
"cniconfig": {
"PluginDirs": [
"/nix/store/lqs84pr7jyyacrd2hw862kr5rkw83dzc-full-cni/bin"
],
"PluginConfDir": "/var/lib/rancher/k3s/agent/etc/cni/net.d/",
"PluginMaxConfNum": 1,
"Prefix": "eth",
"Networks": [
{
"Config": {
"Name": "cni-loopback",
"CNIVersion": "0.3.1",
"Plugins": [
{
"Network": {
"type": "loopback",
"ipam": {},
"dns": {}
},
"Source": "{\"type\":\"loopback\"}"
}
],
"Source": "{\n\"cniVersion\": \"0.3.1\",\n\"name\": \"cni-loopback\",\n\"plugins\": [{\n \"type\": \"loopback\"\n}]\n}"
},
"IFName": "lo"
},
{
"Config": {
"Name": "cbr0",
"CNIVersion": "1.0.0",
"Plugins": [
{
"Network": {
"type": "flannel",
"ipam": {},
"dns": {}
},
"Source": "{\"delegate\":{\"forceAddress\":true,\"hairpinMode\":true,\"isDefaultGateway\":true},\"type\":\"flannel\"}"
},
{
"Network": {
"type": "portmap",
"capabilities": {
"portMappings": true
},
"ipam": {},
"dns": {}
},
"Source": "{\"capabilities\":{\"portMappings\":true},\"type\":\"portmap\"}"
},
{
"Network": {
"type": "bandwidth",
"capabilities": {
"bandwidth": true
},
"ipam": {},
"dns": {}
},
"Source": "{\"capabilities\":{\"bandwidth\":true},\"type\":\"bandwidth\"}"
}
],
"Source": "{\n \"name\":\"cbr0\",\n \"cniVersion\":\"1.0.0\",\n \"plugins\":[\n {\n \"type\":\"flannel\",\n \"delegate\":{\n \"hairpinMode\":true,\n \"forceAddress\":true,\n \"isDefaultGateway\":true\n }\n },\n {\n \"type\":\"portmap\",\n \"capabilities\":{\n \"portMappings\":true\n }\n },\n {\n \"type\":\"bandwidth\",\n \"capabilities\":{\n \"bandwidth\":true\n }\n }\n ]\n}\n"
},
"IFName": "eth0"
}
]
},
"config": {
"containerd": {
"snapshotter": "zfs",
"defaultRuntimeName": "runc",
"defaultRuntime": {
"runtimeType": "",
"runtimePath": "",
"runtimeEngine": "",
"PodAnnotations": null,
"ContainerAnnotations": null,
"runtimeRoot": "",
"options": null,
"privileged_without_host_devices": false,
"privileged_without_host_devices_all_devices_allowed": false,
"baseRuntimeSpec": "",
"cniConfDir": "",
"cniMaxConfNum": 0,
"snapshotter": "",
"sandboxMode": ""
},
"untrustedWorkloadRuntime": {
"runtimeType": "",
"runtimePath": "",
"runtimeEngine": "",
"PodAnnotations": null,
"ContainerAnnotations": null,
"runtimeRoot": "",
"options": null,
"privileged_without_host_devices": false,
"privileged_without_host_devices_all_devices_allowed": false,
"baseRuntimeSpec": "",
"cniConfDir": "",
"cniMaxConfNum": 0,
"snapshotter": "",
"sandboxMode": ""
},
"runtimes": {
"runc": {
"runtimeType": "io.containerd.runc.v2",
"runtimePath": "",
"runtimeEngine": "",
"PodAnnotations": null,
"ContainerAnnotations": null,
"runtimeRoot": "",
"options": {
"BinaryName": "",
"CriuImagePath": "",
"CriuPath": "",
"CriuWorkPath": "",
"IoGid": 0,
"IoUid": 0,
"NoNewKeyring": false,
"NoPivotRoot": false,
"Root": "",
"ShimCgroup": "",
"SystemdCgroup": false
},
"privileged_without_host_devices": false,
"privileged_without_host_devices_all_devices_allowed": false,
"baseRuntimeSpec": "",
"cniConfDir": "",
"cniMaxConfNum": 0,
"snapshotter": "",
"sandboxMode": "podsandbox"
}
},
"noPivot": false,
"disableSnapshotAnnotations": true,
"discardUnpackedLayers": false,
"ignoreBlockIONotEnabledErrors": false,
"ignoreRdtNotEnabledErrors": false
},
"cni": {
"binDir": "/nix/store/lqs84pr7jyyacrd2hw862kr5rkw83dzc-full-cni/bin",
"confDir": "/var/lib/rancher/k3s/agent/etc/cni/net.d/",
"maxConfNum": 1,
"setupSerially": false,
"confTemplate": "",
"ipPref": ""
},
"registry": {
"configPath": "",
"mirrors": {
"iolite.local:5000": {
"endpoint": [
"http://localhost:5000"
]
}
},
"configs": {
"registry.oycs.com": {
"auth": {
"username": "<censored>",
"password": "<censored>",
"auth": "",
"identitytoken": ""
},
"tls": null
}
},
"auths": null,
"headers": null
},
"imageDecryption": {
"keyModel": "node"
},
"disableTCPService": true,
"streamServerAddress": "127.0.0.1",
"streamServerPort": "0",
"streamIdleTimeout": "4h0m0s",
"enableSelinux": false,
"selinuxCategoryRange": 1024,
"sandboxImage": "registry.k8s.io/pause:3.8",
"statsCollectPeriod": 10,
"systemdCgroup": false,
"enableTLSStreaming": false,
"x509KeyPairStreaming": {
"tlsCertFile": "",
"tlsKeyFile": ""
},
"maxContainerLogSize": 16384,
"disableCgroup": false,
"disableApparmor": false,
"restrictOOMScoreAdj": false,
"maxConcurrentDownloads": 3,
"disableProcMount": false,
"unsetSeccompProfile": "",
"tolerateMissingHugetlbController": true,
"disableHugetlbController": true,
"device_ownership_from_security_context": false,
"ignoreImageDefinedVolumes": false,
"netnsMountsUnderStateDir": false,
"enableUnprivilegedPorts": false,
"enableUnprivilegedICMP": false,
"enableCDI": false,
"cdiSpecDirs": [
"/etc/cdi",
"/var/run/cdi"
],
"imagePullProgressTimeout": "5m0s",
"drainExecSyncIOTimeout": "0s",
"imagePullWithSyncFs": false,
"ignoreDeprecationWarnings": null,
"containerdRootDir": "/var/lib/containerd",
"containerdEndpoint": "/run/containerd/containerd.sock",
"rootDir": "/var/lib/containerd/io.containerd.grpc.v1.cri",
"stateDir": "/run/containerd/io.containerd.grpc.v1.cri"
},
"golang": "go1.22.5",
"lastCNILoadStatus": "OK",
"lastCNILoadStatus.default": "OK"
}
```
Steps to reproduce the issue
1. Run containers that crash when they lose their connection to the k8s API (csi-nfs-controller and snapshot-controller do, along with whatever running those requires).
2. systemctl restart containerd

Describe the results you received and expected
The containers that failed are initially Unknown, then move into the CreateContainerError state after a few minutes, and the systemd logs fill with `failed to reserve container name` lines. The container d71d19e0ee9a4ccccd1e2ef3b47140bfe716d584bb54ff96994803f9c2ea3e79 shows up in `ctr c list`, but not in `crictl ps`. If I try to run `ctr rm` on it, it just hangs indefinitely. The only way I've found to recover is to restart the host (this is a single-node cluster). I have to restart containerd every now and then to update it to the latest version, and I'd like to be able to do so without restarting the host.
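A quick way to list the stuck containers is to diff the two views; something like this is what I've been using (assuming the k8s.io namespace, which is where k3s puts its containers):

```sh
# Containers containerd knows about but CRI does not: these are the stuck ones.
comm -23 <(ctr -n k8s.io containers ls -q | sort) \
         <(crictl ps -a -q | sort)

# Removal attempts hang, so bound them with a timeout.
timeout 10 ctr -n k8s.io containers rm d71d19e0ee9a4ccccd1e2ef3b47140bfe716d584bb54ff96994803f9c2ea3e79
```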
What version of containerd are you using?

1.7.20
Any other relevant information

The crictl info results are included above.
uname:
Linux iolite 6.6.42 #1-NixOS SMP PREEMPT_DYNAMIC Thu Jul 25 07:50:58 UTC 2024 x86_64 GNU/Linux

Show configuration if it is related to CRI plugin.
```toml
version = 2
[plugins."io.containerd.grpc.v1.cri"]
[plugins."io.containerd.grpc.v1.cri".cni]
bin_dir = "/nix/store/lqs84pr7jyyacrd2hw862kr5rkw83dzc-full-cni/bin"
conf_dir = "/var/lib/rancher/k3s/agent/etc/cni/net.d/"
[plugins."io.containerd.grpc.v1.cri".containerd]
snapshotter = "zfs"
[plugins."io.containerd.grpc.v1.cri".registry]
[plugins."io.containerd.grpc.v1.cri".registry.configs."registry.oycs.com".auth]
password = ""
username = ""
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."iolite.local:5000"]
endpoint = ["http://localhost:5000"]
```
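Unrelated to the stuck-container bug, but the ContainerdHasDeprecationWarnings condition in the crictl info output is about the mirrors and configs blocks in this registry config. My understanding is that they are meant to move to config_path with per-registry hosts.toml files; a sketch of what I assume the mirror part would look like (the /etc/containerd/certs.d path is containerd's documented default, and I haven't tried this on my cluster):

```toml
# config.toml: replaces [plugins."io.containerd.grpc.v1.cri".registry.mirrors]
[plugins."io.containerd.grpc.v1.cri".registry]
  config_path = "/etc/containerd/certs.d"
```

```toml
# /etc/containerd/certs.d/iolite.local:5000/hosts.toml
server = "http://localhost:5000"

[host."http://localhost:5000"]
  capabilities = ["pull", "resolve"]
```

I'm not sure where the registry.oycs.com auth settings are supposed to move, so I've left those alone.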