Description
We're in the process of updating our service from containerd 1.7 to containerd 2.
2.0.0 works just fine; 2.0.1 and 2.0.2 do not. When deployed onto a worker, logging doesn't appear to be working (there is no /var/log/containerd.log file), and systemctl status shows the following:
root@master-containerd200-e9004f0a8-1737068261:~# systemctl status containerd
● containerd.service - containerd container runtime
     Loaded: loaded (/etc/systemd/system/containerd.service; enabled; preset: enabled)
    Drop-In: /etc/systemd/system/containerd.service.d
             └─containerd-cgroup-dropin.conf
     Active: active (running) since Thu 2025-01-16 23:42:34 UTC; 15h ago
       Docs: https://containerd.io
   Main PID: 5484 (containerd)
      Tasks: 0
     Memory: 9.5M (peak: 10.6M)
        CPU: 52ms
     CGroup: /system.slice/containerd.service
             ‣ 5484 /usr/local/bin/containerd
Jan 16 23:42:34 master-containerd200-e9004f0a8-1737068261 systemd[1]: Starting containerd.service - containerd container runtime...
Jan 16 23:42:34 master-containerd200-e9004f0a8-1737068261 systemd[1]: Started containerd.service - containerd container runtime.
The journalctl logs don't appear to have been useful either. Once we restart the containerd service on the machine, it comes up just fine and the pods that were stuck finish deploying.
After building and testing various commits between 2.0.0 and 2.0.1, I was able to narrow it down to e9004f0 as the cause of our problems. My guess for how the problem occurs (see the sketch after this list):
- containerd starts
- cni is still deploying/not fully provisioned yet
- containerd "locks"
- cni is done deploying
- containerd does not "unlock"
- (if you restart the containerd service) it starts up and works because the cni is now ready and running.
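To make that guess concrete, here's a minimal, self-contained Go sketch of the pattern we suspect. This is not containerd's actual code; netPlugin, syncConfig, and Status are hypothetical stand-ins for whatever e9004f0 changed: a one-shot, lock-guarded CNI config load that fails while the CNI is still provisioning and is never retried.

package main

import (
	"errors"
	"fmt"
	"os"
	"sync"
	"time"
)

// netPlugin is a hypothetical stand-in for the CRI network plugin state,
// not containerd's real type.
type netPlugin struct {
	mu     sync.RWMutex
	loaded bool // set once a CNI conflist has been parsed successfully
}

// syncConfig simulates a one-shot load at startup. If the CNI conflist has
// not been written yet (the worker is still provisioning), it fails, and in
// the suspected buggy pattern nothing ever calls it again.
func (p *netPlugin) syncConfig(confDir string) error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if _, err := os.Stat(confDir); err != nil {
		return errors.New("cni config not ready")
	}
	p.loaded = true
	return nil
}

// Status is what a NetworkReady check would call; it only reads the cached
// flag, so a failed startup load keeps the node NotReady indefinitely.
func (p *netPlugin) Status() error {
	p.mu.RLock()
	defer p.mu.RUnlock()
	if !p.loaded {
		return errors.New("network plugin not ready")
	}
	return nil
}

func main() {
	p := &netPlugin{}
	// CNI is still deploying, so the conf dir doesn't exist yet and the
	// one-shot load fails. (Path is illustrative.)
	_ = p.syncConfig("/etc/cni/net.d/does-not-exist-yet")

	// CNI finishes deploying a moment later, but nothing re-runs
	// syncConfig, so Status never recovers until containerd is restarted.
	time.Sleep(100 * time.Millisecond)
	fmt.Println("status after cni is ready:", p.Status())
}

If that's roughly what's happening, restarting the service "fixes" it only because the restart re-runs the load after the CNI config has landed.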
I have the ability to build branches and deploy to our environment, so I can test any fixes/PRs.
Steps to reproduce the issue
- Occurs during our deployment phase for Kubernetes workers. For us it's simply deploying with containerd 2.0.1+.
- I have the ability to build branches and deploy to our environment, so I can test any fixes.
Describe the results you received and expected
What we received...
(.venv) root@instance-armada-ansible:/armada-ansible# k get pods -A
NAMESPACE NAME READY STATUS RESTARTS AGE
calico-system calico-kube-controllers-7dd6c8cf87-sf9zq 0/1 ContainerCreating 0 14h
calico-system calico-node-46qbq 0/1 Running 0 14h
calico-system calico-typha-55ccf88659-cc6w4 1/1 Running 0 14h
kube-system coredns-57c6c86588-9625g 0/1 ContainerCreating 0 14h
kube-system coredns-57c6c86588-t59ln 0/1 ContainerCreating 0 14h
kube-system coredns-57c6c86588-tcz8z 0/1 ContainerCreating 0 14h
kube-system coredns-autoscaler-6c5545c74b-ktj7w 1/1 Running 0 14h
kube-system dashboard-metrics-scraper-7544b655bb-vhnlp 0/1 ContainerCreating 0 14h
kube-system ibm-file-plugin-6c8bb96855-gr2g6 0/1 ContainerCreating 0 14h
kube-system ibm-file-plugin-6d499b57cc-z74qt 1/1 Running 0 14h
kube-system ibm-keepalived-watcher-s2fst 1/1 Running 0 14h
kube-system ibm-master-proxy-static-10.185.174.237 2/2 Running 0 14h
kube-system ibm-storage-watcher-584c7d8b97-hwjj9 1/1 Running 0 14h
kube-system ibmcloud-block-storage-driver-lxb7g 1/1 Running 0 14h
kube-system ibmcloud-block-storage-plugin-6d696fd8d6-f2nvg 1/1 Running 0 14h
kube-system konnectivity-agent-4mbmv 0/1 ContainerCreating 0 14h
kube-system kubernetes-dashboard-7bfc9d559c-2d9zx 0/1 ContainerCreating 0 14h
kube-system metrics-server-58d4684bb7-r5nc2 2/3 Running 0 14h
kube-system metrics-server-67564b8f67-lvnfx 0/3 ContainerCreating 0 14h
kube-system metrics-server-67564b8f67-spg94 0/3 ContainerCreating 0 14h
kube-system snapshot-controller-58c66fc494-88jfj 0/1 Terminating 0 14h
kube-system snapshot-controller-58c66fc494-p2twr 1/1 Running 0 14h
kube-system snapshot-controller-78998567d9-bz6xw 0/1 Pending 0 14h
kube-system snapshot-controller-78998567d9-vt5p8 1/1 Running 0 14h
tigera-operator tigera-operator-5fb656b7b-ccvsb 1/1 Running 0 14h
We expect all pods to be running.
What version of containerd are you using?
2.0.1
Any other relevant information
No response
Show configuration if it is related to CRI plugin.
version = 3
root = "/var/data/cripersistentstorage"
state = "/run/containerd"
oom_score = 0

[grpc]
  address = "/run/containerd/containerd.sock"
  uid = 0
  gid = 0
  max_recv_message_size = 16777216
  max_send_message_size = 16777216

[debug]
  address = ""
  uid = 0
  gid = 0
  level = ""

[metrics]
  address = "10.185.174.237:10210"
  grpc_histogram = false

[cgroup]
  path = "/podruntime/runtime"

[plugins]
  [plugins.'io.containerd.cri.v1.images']
    image_pull_progress_timeout = "5m0s"
    stats_collect_period = 10
    snapshotter = "overlayfs"
    disable_snapshot_annotations = true
    discard_unpacked_layers = false

    [plugins.'io.containerd.cri.v1.images'.registry]
      config_path = "/etc/containerd/certs.d"

    [plugins.'io.containerd.cri.v1.images'.pinned_images]
      sandbox = "us.icr.io/armada-master/pause:3.10"

  [plugins.'io.containerd.cri.v1.runtime']
    enable_selinux = false
    selinux_category_range = 1024
    tolerate_missing_hugetlb_controller = true
    ignore_image_defined_volumes = false
    drain_exec_sync_io_timeout = "0s"

    [plugins.'io.containerd.cri.v1.runtime'.containerd]
      default_runtime_name = 'runc'

      [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes]
        [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runc]
          runtime_type = "io.containerd.runc.v2"
          pod_annotations = []
          container_annotations = []
          privileged_without_host_devices = false
          base_runtime_spec = ""

          [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.runc.options]
            BinaryName = ''
            CriuImagePath = ''
            CriuWorkPath = ''
            IoGid = 0
            IoUid = 0
            NoNewKeyring = false
            Root = ''
            ShimCgroup = ''

        [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.untrusted]
          runtime_type = "io.containerd.runc.v2"
          pod_annotations = []
          container_annotations = []
          privileged_without_host_devices = false

  [plugins.'io.containerd.grpc.v1.cri']
    disable_tcp_service = true
    stream_server_address = "127.0.0.1"
    stream_server_port = "0"
    stream_idle_timeout = "15m"
    # IBM Cloud Container Registry has a 6 minute default timeout. Per [1],
    # the registry squad recommended a shorter timeout for containerd client.
    # [1] https://github.ibm.com/alchemy-containers/armada-update/issues/1023
    enable_tls_streaming = false
    systemd_cgroup = false

    [plugins."io.containerd.grpc.v1.cri".registry]
      config_path = "/etc/containerd/certs.d"

  [plugins."io.containerd.service.v1.diff-service"]
    default = ["walking"]

  [plugins."io.containerd.gc.v1.scheduler"]
    pause_threshold = 0.02
    deletion_threshold = 0
    mutation_threshold = 100
    schedule_delay = "0s"
    startup_delay = "100ms"