Skip to content

shim process leaked #9309

@johannesfrey

Description

@johannesfrey

Description

When a host is experiencing high I/O load, there are cases where containerd-shim processes remain on the host even though the underlying container processes have already vanished.
Even though there have already been multiple reports of the same symptom, I went ahead and created another issue because it was requested here, and in the hope of having this as the single remaining tracking issue. Possible duplicates:

@fuweid did already create PR #8954 to address the initial issue. Nevertheless, I am still able to reproduce the issue with containerd in all versions ranging from 1.7.4 up to 1.7.8.
The issue appears in the wild on some of our Kubernetes clusters running on containerd v1.7.5, which have either high I/O load or slow backing storage (which is mimicked by step 2 in the "Steps to reproduce the issue" section below).
There is also a relevant issue that describes ideas to make the task.Delete API retriable: #8981

Logs from containerd when the problem arises:

Oct 30 12:25:48 myhost containerd[15506]: time="2023-10-30T12:25:48.226969594Z" level=error msg="failed to handle container TaskExit event container_id:\"5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de\" id:\"5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de\" pid:16710 exit_status:137 exited_at:{seconds:1698668738 nanos:225573675}" error="failed to stop container: context deadline exceeded: unknown"
Oct 30 12:25:49 myhost containerd[15506]: time="2023-10-30T12:25:49.291791891Z" level=error msg="collecting metrics for 5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de" error="cgroups: cgroup deleted"
Oct 30 12:25:49 myhost containerd[15506]: time="2023-10-30T12:25:49.302085250Z" level=error msg="collecting metrics for 5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de" error="cgroups: cgroup deleted"
Oct 30 12:25:49 myhost containerd[15506]: time="2023-10-30T12:25:49.395805456Z" level=info msg="TaskExit event container_id:\"5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de\" id:\"5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de\" pid:16710 exit_status:137 exited_at:{seconds:1698668738 nanos:225573675}"
Oct 30 12:25:51 myhost containerd[15506]: time="2023-10-30T12:25:51.396618687Z" level=error msg="get state for 5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de" error="context deadline exceeded: unknown"
Oct 30 12:25:51 myhost containerd[15506]: time="2023-10-30T12:25:51.396695890Z" level=warning msg="unknown status" status=0
Oct 30 12:25:55 myhost containerd[15506]: time="2023-10-30T12:25:55.551329729Z" level=error msg="collecting metrics for 5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de" error="cgroups: cgroup deleted"
Oct 30 12:25:59 myhost containerd[15506]: time="2023-10-30T12:25:59.396058679Z" level=error msg="Failed to handle backOff event container_id:\"5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de\" id:\"5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de\" pid:16710 exit_status:137 exited_at:{seconds:1698668738 nanos:225573675} for 5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de" error="failed to handle container TaskExit event: failed to stop container: context deadline exceeded: unknown"
Oct 30 12:26:02 myhost containerd[15506]: time="2023-10-30T12:26:02.289122846Z" level=error msg="ttrpc: received message on inactive stream" stream=23
Oct 30 12:26:02 myhost containerd[15506]: time="2023-10-30T12:26:02.290169641Z" level=error msg="ttrpc: received message on inactive stream" stream=25
Oct 30 12:26:02 myhost containerd[15506]: time="2023-10-30T12:26:02.290647016Z" level=error msg="ttrpc: received message on inactive stream" stream=27
Oct 30 12:26:02 myhost containerd[15506]: time="2023-10-30T12:26:02.290988137Z" level=error msg="ttrpc: received message on inactive stream" stream=21
Oct 30 12:26:02 myhost containerd[15506]: time="2023-10-30T12:26:02.395505466Z" level=info msg="TaskExit event container_id:\"5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de\" id:\"5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de\" pid:16710 exit_status:137 exited_at:{seconds:1698668738 nanos:225573675}"
Oct 30 12:26:02 myhost containerd[15506]: time="2023-10-30T12:26:02.397432721Z" level=error msg="Failed to handle backOff event container_id:\"5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de\" id:\"5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de\" pid:16710 exit_status:137 exited_at:{seconds:1698668738 nanos:225573675} for 5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de" error="failed to handle container TaskExit event: failed to cleanup container 5869659f160839a4d2001bf99a46e03d0ec1b3d97322b4d8b0b51befe76039de in task-service: container must be created: failed precondition"

Steps to reproduce the issue

Shameless plug of #7496 (comment)

  1. kubectl run nginx --image=nginx --restart=Never
  2. sudo strace -p $(pidof target-shim) --trace=umount2 -f --detach-on=execve -e inject=umount2:delay_enter=12s
  3. kill -9 $(pidof nginx)

Describe the results you received and expected

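Received: the containerd-shim process remains on the host after the container process has been killed. Expected: the containerd-shim processes are removed as well.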

What version of containerd are you using?

containerd github.com/containerd/containerd v1.7.5 fe457eb

Any other relevant information

runc --version

runc version 1.1.9
commit: v1.1.9-0-gccaecfcb
spec: 1.0.2-dev
go: go1.20.7
libseccomp: 2.5.1

crictl info

{
  "status": {
    "conditions": [
      {
        "type": "RuntimeReady",
        "status": true,
        "reason": "",
        "message": ""
      },
      {
        "type": "NetworkReady",
        "status": true,
        "reason": "",
        "message": ""
      }
    ]
  },
  "cniconfig": {
    "PluginDirs": [
      "/opt/cni/bin"
    ],
    "PluginConfDir": "/etc/cni/net.d",
    "PluginMaxConfNum": 1,
    "Prefix": "eth",
    "Networks": [
      {
        "Config": {
          "Name": "cni-loopback",
          "CNIVersion": "0.3.1",
          "Plugins": [
            {
              "Network": {
                "type": "loopback",
                "ipam": {},
                "dns": {}
              },
              "Source": "{\"type\":\"loopback\"}"
            }
          ],
          "Source": "{\n\"cniVersion\": \"0.3.1\",\n\"name\": \"cni-loopback\",\n\"plugins\": [{\n  \"type\": \"loopback\"\n}]\n}"
        },
        "IFName": "lo"
      },
      {
        "Config": {
          "Name": "k8s-pod-network",
          "CNIVersion": "0.3.1",
          "Plugins": [
            {
              "Network": {
                "type": "calico",
                "ipam": {
                  "type": "host-local"
                },
                "dns": {}
              },
              "Source": "{\"datastore_type\":\"kubernetes\",\"ipam\":{\"ranges\":[[{\"subnet\":\"usePodCidr\"}]],\"routes\":[{\"dst\":\"0.0.0.0/0\"}],\"type\":\"host-local\"},\"kubernetes\":{\"kubeconfig\":\"/etc/cni/net.d/calico-kubeconfig\"},\"log_level\":\"error\",\"mtu\":1500,\"nodename\":\"myhost\",\"policy\":{\"type\":\"k8s\"},\"type\":\"calico\"}"
            },
            {
              "Network": {
                "type": "portmap",
                "capabilities": {
                  "portMappings": true
                },
                "ipam": {},
                "dns": {}
              },
              "Source": "{\"capabilities\":{\"portMappings\":true},\"type\":\"portmap\"}"
            }
          ],
          "Source": "{\n  \"name\": \"k8s-pod-network\",\n  \"cniVersion\": \"0.3.1\",\n  \"plugins\": [\n    {\n      \"type\": \"calico\",\n      \"log_level\": \"error\",\n      \"datastore_type\": \"kubernetes\",\n      \"nodename\": \"myhost\",\n      \"mtu\": 1500,\n      \"ipam\": {\n          \"type\": \"host-local\",\n          \"ranges\": [\n            [\n              {\n                \"subnet\": \"usePodCidr\"\n              }\n            ]\n          ],\n          \"routes\": [\n            {\n              \"dst\": \"0.0.0.0/0\"\n            }\n          ]\n      },\n      \"policy\": {\n          \"type\": \"k8s\"\n      },\n      \"kubernetes\": {\n          \"kubeconfig\": \"/etc/cni/net.d/calico-kubeconfig\"\n      }\n    },\n    {\n      \"type\": \"portmap\",\n      \"capabilities\": {\"portMappings\": true}\n    }\n  ]\n}"
        },
        "IFName": "eth0"
      }
    ]
  },
  "config": {
    "containerd": {
      "snapshotter": "overlayfs",
      "defaultRuntimeName": "default",
      "defaultRuntime": {
        "runtimeType": "io.containerd.runtime.v1.linux",
        "runtimePath": "",
        "runtimeEngine": "",
        "PodAnnotations": null,
        "ContainerAnnotations": null,
        "runtimeRoot": "",
        "options": null,
        "privileged_without_host_devices": false,
        "privileged_without_host_devices_all_devices_allowed": false,
        "baseRuntimeSpec": "",
        "cniConfDir": "",
        "cniMaxConfNum": 0,
        "snapshotter": "",
        "sandboxMode": ""
      },
      "untrustedWorkloadRuntime": {
        "runtimeType": "",
        "runtimePath": "",
        "runtimeEngine": "",
        "PodAnnotations": null,
        "ContainerAnnotations": null,
        "runtimeRoot": "",
        "options": null,
        "privileged_without_host_devices": false,
        "privileged_without_host_devices_all_devices_allowed": false,
        "baseRuntimeSpec": "",
        "cniConfDir": "",
        "cniMaxConfNum": 0,
        "snapshotter": "",
        "sandboxMode": ""
      },
      "runtimes": {
        "default": {
          "runtimeType": "io.containerd.runtime.v1.linux",
          "runtimePath": "",
          "runtimeEngine": "",
          "PodAnnotations": null,
          "ContainerAnnotations": null,
          "runtimeRoot": "",
          "options": null,
          "privileged_without_host_devices": false,
          "privileged_without_host_devices_all_devices_allowed": false,
          "baseRuntimeSpec": "",
          "cniConfDir": "",
          "cniMaxConfNum": 0,
          "snapshotter": "",
          "sandboxMode": "podsandbox"
        },
        "runc": {
          "runtimeType": "io.containerd.runc.v2",
          "runtimePath": "",
          "runtimeEngine": "",
          "PodAnnotations": null,
          "ContainerAnnotations": null,
          "runtimeRoot": "",
          "options": {
            "BinaryName": "",
            "CriuImagePath": "",
            "CriuPath": "",
            "CriuWorkPath": "",
            "IoGid": 0,
            "IoUid": 0,
            "NoNewKeyring": false,
            "NoPivotRoot": false,
            "Root": "",
            "ShimCgroup": "",
            "SystemdCgroup": false
          },
          "privileged_without_host_devices": false,
          "privileged_without_host_devices_all_devices_allowed": false,
          "baseRuntimeSpec": "",
          "cniConfDir": "",
          "cniMaxConfNum": 0,
          "snapshotter": "",
          "sandboxMode": "podsandbox"
        }
      },
      "noPivot": false,
      "disableSnapshotAnnotations": true,
      "discardUnpackedLayers": false,
      "ignoreBlockIONotEnabledErrors": false,
      "ignoreRdtNotEnabledErrors": false
    },
    "cni": {
      "binDir": "/opt/cni/bin",
      "confDir": "/etc/cni/net.d",
      "maxConfNum": 1,
      "setupSerially": false,
      "confTemplate": "",
      "ipPref": ""
    },
    "registry": {
      "configPath": "",
      "mirrors": null,
      "configs": null,
      "auths": null,
      "headers": null
    },
    "imageDecryption": {
      "keyModel": "node"
    },
    "disableTCPService": true,
    "streamServerAddress": "127.0.0.1",
    "streamServerPort": "10260",
    "streamIdleTimeout": "4h0m0s",
    "enableSelinux": false,
    "selinuxCategoryRange": 1024,
    "sandboxImage": "registry-emea.app.corpintra.net/caas/pause:3.9",
    "statsCollectPeriod": 10,
    "systemdCgroup": true,
    "enableTLSStreaming": false,
    "x509KeyPairStreaming": {
      "tlsCertFile": "",
      "tlsKeyFile": ""
    },
    "maxContainerLogSize": 4194304,
    "disableCgroup": false,
    "disableApparmor": false,
    "restrictOOMScoreAdj": false,
    "maxConcurrentDownloads": 3,
    "disableProcMount": false,
    "unsetSeccompProfile": "",
    "tolerateMissingHugetlbController": true,
    "disableHugetlbController": true,
    "device_ownership_from_security_context": false,
    "ignoreImageDefinedVolumes": false,
    "netnsMountsUnderStateDir": false,
    "enableUnprivilegedPorts": false,
    "enableUnprivilegedICMP": false,
    "enableCDI": false,
    "cdiSpecDirs": [
      "/etc/cdi",
      "/var/run/cdi"
    ],
    "imagePullProgressTimeout": "1m0s",
    "drainExecSyncIOTimeout": "0s",
    "containerdRootDir": "/var/lib/containerd",
    "containerdEndpoint": "/run/containerd/containerd.sock",
    "rootDir": "/var/lib/containerd/io.containerd.grpc.v1.cri",
    "stateDir": "/run/containerd/io.containerd.grpc.v1.cri"
  },
  "golang": "go1.20.7",
  "lastCNILoadStatus": "OK",
  "lastCNILoadStatus.default": "OK"
}

uname -a

Linux myhost 5.15.0-82-generic #91~20.04.1-Ubuntu SMP Fri Aug 18 16:24:39 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux

Show configuration if it is related to CRI plugin.

root = "/var/lib/containerd"
state = "/run/containerd"
oom_score = 0

[grpc]
  address = "/run/containerd/containerd.sock"
  uid = 0
  gid = 0
  max_recv_message_size = 16777216
  max_send_message_size = 16777216

[debug]
  address = "/run/containerd/containerd-debug.sock"
  uid = 0
  gid = 0
  level = ""

[metrics]
  address = "0.0.0.0:1338"
  grpc_histogram = false

[cgroup]
  path = ""

[plugins]
  [plugins.cgroups]
    no_prometheus = false
  [plugins.cri]
    stream_server_port = "10260"
    stats_collect_period = 10
    systemd_cgroup = true
    enable_tls_streaming = false
    max_container_log_line_size = 4194304
    disable_proc_mount = false
    [plugins.cri.containerd]
      snapshotter = "overlayfs"
      no_pivot = false
      [plugins.cri.containerd.default_runtime]
        runtime_type = "io.containerd.runtime.v1.linux"
        runtime_engine = ""
        runtime_root = ""
      [plugins.cri.containerd.untrusted_workload_runtime]
        runtime_type = ""
        runtime_engine = ""
        runtime_root = ""
    [plugins.cri.cni]
      bin_dir = "/opt/cni/bin"
      conf_dir = "/etc/cni/net.d"
      conf_template = ""
    [plugins.cri.x509_key_pair_streaming]
      tls_cert_file = ""
      tls_key_file = ""
  [plugins.diff-service]
    default = ["walking"]
  [plugins.linux]
    shim = "containerd-shim"
    runtime = "runc"
    runtime_root = ""
    no_shim = false
    shim_debug = false
  [plugins.opt]
    path = "/opt/containerd"
  [plugins.restart]
    interval = "10s"
  [plugins.scheduler]
    pause_threshold = 0.02
    deletion_threshold = 0
    mutation_threshold = 100
    schedule_delay = "0s"
    startup_delay = "100ms"

Johannes Frey <[email protected]>, Mercedes-Benz Tech Innovation GmbH (Provider Information)

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions