Skip to content

containers entering and leaving UNKNOWN state constantly #8647

@kennyqn

Description

@kennyqn

Description

On an idle system, after a few days, containers on that system have begun to enter an UNKNOWN state and then shortly after they are no longer UNKNOWN but other new containers enter the UNKNOWN state.

# ctr -n k8s.io tasks ls | grep -v RUNNING
TASK                                                                PID        STATUS
286cd84ed1b77e1c38a978a2dfe9d27dfcce17000d258cc87ce08a534801d4c0    2991141    STOPPED
bdef4e5a7997d72424ae03533b0c3c79b8984e6f484c612a4765efa3aaa87d80    2989001    STOPPED
039615b58a308b8cd4195e0eaafda39a8cf5d85ed2c1e19ac9867c3cac760f78    0          UNKNOWN
4353acaea26effba5f2bbad7f57ef21bcb1f94662a5a78470ce37660d23002c2    0          UNKNOWN
5f1b35b6bc7e8db1590662d14ef7420897376cf437d556ee8d1dfc842afa0708    0          UNKNOWN
975572dc225e2d91a7ef431d0ed8b2cc1292926196527d5c3ecc8b507e1623fd    0          UNKNOWN
dec4206c25b8235a3664a9d775635db47016857b48c5e9e0a88ea1a78bea7ffc    3159018    STOPPED
# date
Thu Jun  1 21:01:08 UTC 2023
# ctr -n k8s.io tasks ls | grep -v RUNNING
TASK                                                                PID        STATUS
286cd84ed1b77e1c38a978a2dfe9d27dfcce17000d258cc87ce08a534801d4c0    2991141    STOPPED
bdef4e5a7997d72424ae03533b0c3c79b8984e6f484c612a4765efa3aaa87d80    2989001    STOPPED
4b7a347d5c83eff637d51e224f62e05b24e09c9d3e73415762854e6750827b41    0          UNKNOWN
ba981aa75ced9c837a0a1c953f672fd3240cf54fdf1027aee1ea6c430699b037    0          UNKNOWN
192d6ea5027838b273238098ed858ac0ecc63f85a9aeeea828c05012dd318086    0          UNKNOWN
3c2cde85999b2d3999b50cae49b67e417424930a81c23f49401333960df7652f    0          UNKNOWN
a2d1de04a774ac3bfcf9428bd14ffd03e47f141a129c27d9ea56336e77549590    0          UNKNOWN
162a5fda5a185de8d030dae8907266a80ebaa43dc7071c91867d9d8fc17c67ac    0          UNKNOWN
dec4206c25b8235a3664a9d775635db47016857b48c5e9e0a88ea1a78bea7ffc    3159018    STOPPED
# date
Thu Jun  1 21:03:10 UTC 2023

Aside from that, I also checked top for CPU usage and for zombie processes. CPU usage does spike but zombie processes are not accumulating, but rather it moves between 16-19 zombie processes at one time

# ps aux | grep 'Z'
USER         PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root      130544  0.0  0.0      0     0 ?        Z    19:21   0:00 [etcdctl] <defunct>
root      174464  0.0  0.0      0     0 ?        Z    20:06   0:01 [etcdctl] <defunct>
root      223819  0.0  0.0   8156   720 pts/0    S+   20:47   0:00 grep --color=auto Z
root     3786872  0.0  0.0      0     0 ?        Z    12:21   0:00 [etcdctl] <defunct>
root     3818812  0.0  0.0      0     0 ?        Z    12:47   0:01 [etcdctl] <defunct>
root     3831673  0.0  0.0      0     0 ?        Z    12:56   0:00 [etcdctl] <defunct>
root     3834429  0.0  0.0      0     0 ?        Z    12:59   0:01 [etcdctl] <defunct>
root     3836379  0.0  0.0      0     0 ?        Z    13:01   0:00 [etcdctl] <defunct>
root     3872437  0.0  0.0      0     0 ?        Z    13:29   0:00 [etcdctl] <defunct>
root     3888246  0.0  0.0      0     0 ?        Z    13:41   0:00 [etcdctl] <defunct>
root     3913216  0.0  0.0      0     0 ?        Z    14:02   0:00 [etcdctl] <defunct>
root     3958962  0.0  0.0      0     0 ?        Z    14:37   0:00 [etcdctl] <defunct>
root     3964062  0.0  0.0      0     0 ?        Z    14:41   0:01 [etcdctl] <defunct>
root     3980945  0.0  0.0      0     0 ?        Z    14:54   0:01 [etcdctl] <defunct>
root     3994609  0.0  0.0      0     0 ?        Z    15:07   0:00 [etcdctl] <defunct>
root     3995238  0.0  0.0      0     0 ?        Z    15:08   0:01 [etcdctl] <defunct>
root     3995799  0.0  0.0      0     0 ?        Z    15:09   0:01 [etcdctl] <defunct>
root     3998548  0.0  0.0      0     0 ?        Z    15:11   0:00 [etcdctl] <defunct>
# pstree -p -s 3964062
systemd(1)───containerd-shim(464269)───etcd(3781143)───etcdctl(3964062)

the zombie processes seem to stem from a containerd-shim process.

Steps to reproduce the issue

N/A

Describe the results you received and expected

idle system that was installed successfully/healthy went into an unstable state after a few days.

What version of containerd are you using?

1.6.20

Any other relevant information

runc version 1.1.5

crictl info

{
  "status": {
    "conditions": [
      {
        "type": "RuntimeReady",
        "status": true,
        "reason": "",
        "message": ""
      },
      {
        "type": "NetworkReady",
        "status": true,
        "reason": "",
        "message": ""
      }
    ]
  },
  "cniconfig": {
    "PluginDirs": [
      "/opt/cni/bin"
    ],
    "PluginConfDir": "/etc/cni/net.d",
    "PluginMaxConfNum": 1,
    "Prefix": "eth",
    "Networks": [
      {
        "Config": {
          "Name": "cni-loopback",
          "CNIVersion": "0.3.1",
          "Plugins": [
            {
              "Network": {
                "type": "loopback",
                "ipam": {},
                "dns": {}
              },
              "Source": "{\"type\":\"loopback\"}"
            }
          ],
          "Source": "{\n\"cniVersion\": \"0.3.1\",\n\"name\": \"cni-loopback\",\n\"plugins\": [{\n  \"type\": \"loopback\"\n}]\n}"
        },
        "IFName": "lo"
      },
      {
        "Config": {
          "Name": "k8s-pod-network",
          "CNIVersion": "0.3.1",
          "Plugins": [
            {
              "Network": {
                "type": "calico",
                "ipam": {
                  "type": "calico-ipam"
                },
                "dns": {}
              },
              "Source": "{\"datastore_type\":\"kubernetes\",\"ipam\":{\"type\":\"calico-ipam\"},\"kubernetes\":{\"kubeconfig\":\"/etc/cni/net.d/calico-kubeconfig\"},\"log_level\":\"info\",\"mtu\":1440,\"nodename\":\"apicdev3176\",\"policy\":{\"type\":\"k8s\"},\"type\":\"calico\"}"
            },
            {
              "Network": {
                "type": "portmap",
                "capabilities": {
                  "portMappings": true
                },
                "ipam": {},
                "dns": {}
              },
              "Source": "{\"capabilities\":{\"portMappings\":true},\"snat\":true,\"type\":\"portmap\"}"
            },
            {
              "Network": {
                "type": "bandwidth",
                "capabilities": {
                  "bandwidth": true
                },
                "ipam": {},
                "dns": {}
              },
              "Source": "{\"capabilities\":{\"bandwidth\":true},\"type\":\"bandwidth\"}"
            }
          ],
          "Source": "{\n  \"name\": \"k8s-pod-network\",\n  \"cniVersion\": \"0.3.1\",\n  \"plugins\": [\n    {\n      \"type\": \"calico\",\n      \"log_level\": \"info\",\n      \"datastore_type\": \"kubernetes\",\n      \"nodename\": \"apicdev3176\",\n      \"mtu\": 1440,\n      \"ipam\": {\n          \"type\": \"calico-ipam\"\n      },\n      \"policy\": {\n          \"type\": \"k8s\"\n      },\n      \"kubernetes\": {\n          \"kubeconfig\": \"/etc/cni/net.d/calico-kubeconfig\"\n      }\n    },\n    {\n      \"type\": \"portmap\",\n      \"snat\": true,\n      \"capabilities\": {\"portMappings\": true}\n    },\n    {\n      \"type\": \"bandwidth\",\n      \"capabilities\": {\"bandwidth\": true}\n    }\n  ]\n}\n"
        },
        "IFName": "eth0"
      }
    ]
  },
  "config": {
    "containerd": {
      "snapshotter": "overlayfs",
      "defaultRuntimeName": "runc",
      "defaultRuntime": {
        "runtimeType": "",
        "runtimePath": "",
        "runtimeEngine": "",
        "PodAnnotations": [],
        "ContainerAnnotations": [],
        "runtimeRoot": "",
        "options": {},
        "privileged_without_host_devices": false,
        "baseRuntimeSpec": "",
        "cniConfDir": "",
        "cniMaxConfNum": 0
      },
      "untrustedWorkloadRuntime": {
        "runtimeType": "",
        "runtimePath": "",
        "runtimeEngine": "",
        "PodAnnotations": [],
        "ContainerAnnotations": [],
        "runtimeRoot": "",
        "options": {},
        "privileged_without_host_devices": false,
        "baseRuntimeSpec": "",
        "cniConfDir": "",
        "cniMaxConfNum": 0
      },
      "runtimes": {
        "runc": {
          "runtimeType": "io.containerd.runc.v2",
          "runtimePath": "",
          "runtimeEngine": "",
          "PodAnnotations": [],
          "ContainerAnnotations": [],
          "runtimeRoot": "",
          "options": {
            "BinaryName": "",
            "CriuImagePath": "",
            "CriuPath": "",
            "CriuWorkPath": "",
            "IoGid": 0,
            "IoUid": 0,
            "NoNewKeyring": false,
            "NoPivotRoot": false,
            "Root": "",
            "ShimCgroup": "",
            "SystemdCgroup": false
          },
          "privileged_without_host_devices": false,
          "baseRuntimeSpec": "",
          "cniConfDir": "",
          "cniMaxConfNum": 0
        }
      },
      "noPivot": false,
      "disableSnapshotAnnotations": true,
      "discardUnpackedLayers": false,
      "ignoreRdtNotEnabledErrors": false
    },
    "cni": {
      "binDir": "/opt/cni/bin",
      "confDir": "/etc/cni/net.d",
      "maxConfNum": 1,
      "confTemplate": "",
      "ipPref": ""
    },
    "registry": {
      "configPath": "",
      "mirrors": {},
      "configs": {},
      "auths": {},
      "headers": {}
    },
    "imageDecryption": {
      "keyModel": "node"
    },
    "disableTCPService": true,
    "streamServerAddress": "127.0.0.1",
    "streamServerPort": "0",
    "streamIdleTimeout": "4h0m0s",
    "enableSelinux": false,
    "selinuxCategoryRange": 1024,
    "sandboxImage": "registry.k8s.io/pause:3.6",
    "statsCollectPeriod": 10,
    "systemdCgroup": false,
    "enableTLSStreaming": false,
    "x509KeyPairStreaming": {
      "tlsCertFile": "",
      "tlsKeyFile": ""
    },
    "maxContainerLogSize": 16384,
    "disableCgroup": false,
    "disableApparmor": false,
    "restrictOOMScoreAdj": false,
    "maxConcurrentDownloads": 3,
    "disableProcMount": false,
    "unsetSeccompProfile": "",
    "tolerateMissingHugetlbController": true,
    "disableHugetlbController": true,
    "device_ownership_from_security_context": false,
    "ignoreImageDefinedVolumes": false,
    "netnsMountsUnderStateDir": false,
    "enableUnprivilegedPorts": false,
    "enableUnprivilegedICMP": false,
    "containerdRootDir": "/var/lib/containerd",
    "containerdEndpoint": "/run/containerd/containerd.sock",
    "rootDir": "/var/lib/containerd/io.containerd.grpc.v1.cri",
    "stateDir": "/run/containerd/io.containerd.grpc.v1.cri"
  },
  "golang": "go1.19.7",
  "lastCNILoadStatus": "OK",
  "lastCNILoadStatus.default": "OK"
}

uname -a

Linux apicdev3176 5.4.0-132-generic #148-Ubuntu SMP Mon Oct 17 16:02:06 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux

Show configuration if it is related to CRI plugin.

disabled_plugins = []
imports = []
oom_score = 0
plugin_dir = ""
required_plugins = []
root = "/var/lib/containerd"
state = "/run/containerd"
temp = ""
version = 2

[cgroup]
  path = ""

[debug]
  address = ""
  format = ""
  gid = 0
  level = ""
  uid = 0

[grpc]
  address = "/run/containerd/containerd.sock"
  gid = 0
  max_recv_message_size = 16777216
  max_send_message_size = 16777216
  tcp_address = ""
  tcp_tls_ca = ""
  tcp_tls_cert = ""
  tcp_tls_key = ""
  uid = 0

[metrics]
  address = ""
  grpc_histogram = false

[plugins]

  [plugins."io.containerd.gc.v1.scheduler"]
    deletion_threshold = 0
    mutation_threshold = 100
    pause_threshold = 0.02
    schedule_delay = "0s"
    startup_delay = "100ms"

  [plugins."io.containerd.grpc.v1.cri"]
    device_ownership_from_security_context = false
    disable_apparmor = false
    disable_cgroup = false
    disable_hugetlb_controller = true
    disable_proc_mount = false
    disable_tcp_service = true
    enable_selinux = false
    enable_tls_streaming = false
    enable_unprivileged_icmp = false
    enable_unprivileged_ports = false
    ignore_image_defined_volumes = false
    max_concurrent_downloads = 3
    max_container_log_line_size = 16384
    netns_mounts_under_state_dir = false
    restrict_oom_score_adj = false
    sandbox_image = "registry.k8s.io/pause:3.6"
    selinux_category_range = 1024
    stats_collect_period = 10
    stream_idle_timeout = "4h0m0s"
    stream_server_address = "127.0.0.1"
    stream_server_port = "0"
    systemd_cgroup = false
    tolerate_missing_hugetlb_controller = true
    unset_seccomp_profile = ""

    [plugins."io.containerd.grpc.v1.cri".cni]
      bin_dir = "/opt/cni/bin"
      conf_dir = "/etc/cni/net.d"
      conf_template = ""
      ip_pref = ""
      max_conf_num = 1

    [plugins."io.containerd.grpc.v1.cri".containerd]
      default_runtime_name = "runc"
      disable_snapshot_annotations = true
      discard_unpacked_layers = false
      ignore_rdt_not_enabled_errors = false
      no_pivot = false
      snapshotter = "overlayfs"

      [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
        base_runtime_spec = ""
        cni_conf_dir = ""
        cni_max_conf_num = 0
        container_annotations = []
        pod_annotations = []
        privileged_without_host_devices = false
        runtime_engine = ""
        runtime_path = ""
        runtime_root = ""
        runtime_type = ""

        [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime.options]

      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]

        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
          base_runtime_spec = ""
          cni_conf_dir = ""
          cni_max_conf_num = 0
          container_annotations = []
          pod_annotations = []
          privileged_without_host_devices = false
          runtime_engine = ""
          runtime_path = ""
          runtime_root = ""
          runtime_type = "io.containerd.runc.v2"

          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
            BinaryName = ""
            CriuImagePath = ""
            CriuPath = ""
            CriuWorkPath = ""
            IoGid = 0
            IoUid = 0
            NoNewKeyring = false
            NoPivotRoot = false
            Root = ""
            ShimCgroup = ""
            SystemdCgroup = false

      [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
        base_runtime_spec = ""
        cni_conf_dir = ""
        cni_max_conf_num = 0
        container_annotations = []
        pod_annotations = []
        privileged_without_host_devices = false
        runtime_engine = ""
        runtime_path = ""
        runtime_root = ""
        runtime_type = ""

        [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime.options]

    [plugins."io.containerd.grpc.v1.cri".image_decryption]
      key_model = "node"

    [plugins."io.containerd.grpc.v1.cri".registry]
      config_path = ""

      [plugins."io.containerd.grpc.v1.cri".registry.auths]

      [plugins."io.containerd.grpc.v1.cri".registry.configs]

      [plugins."io.containerd.grpc.v1.cri".registry.headers]

      [plugins."io.containerd.grpc.v1.cri".registry.mirrors]

    [plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
      tls_cert_file = ""
      tls_key_file = ""

  [plugins."io.containerd.internal.v1.opt"]
    path = "/opt/containerd"

  [plugins."io.containerd.internal.v1.restart"]
    interval = "10s"

  [plugins."io.containerd.internal.v1.tracing"]
    sampling_ratio = 1.0
    service_name = "containerd"

  [plugins."io.containerd.metadata.v1.bolt"]
    content_sharing_policy = "shared"

  [plugins."io.containerd.monitor.v1.cgroups"]
    no_prometheus = false

  [plugins."io.containerd.runtime.v1.linux"]
    no_shim = false
    runtime = "runc"
    runtime_root = ""
    shim = "containerd-shim"
    shim_debug = false

  [plugins."io.containerd.runtime.v2.task"]
    platforms = ["linux/amd64"]
    sched_core = false

  [plugins."io.containerd.service.v1.diff-service"]
    default = ["walking"]

  [plugins."io.containerd.service.v1.tasks-service"]
    rdt_config_file = ""

  [plugins."io.containerd.snapshotter.v1.aufs"]
    root_path = ""

  [plugins."io.containerd.snapshotter.v1.btrfs"]
    root_path = ""

  [plugins."io.containerd.snapshotter.v1.devmapper"]
    async_remove = false
    base_image_size = ""
    discard_blocks = false
    fs_options = ""
    fs_type = ""
    pool_name = ""
    root_path = ""

  [plugins."io.containerd.snapshotter.v1.native"]
    root_path = ""

  [plugins."io.containerd.snapshotter.v1.overlayfs"]
    root_path = ""
    upperdir_label = false

  [plugins."io.containerd.snapshotter.v1.zfs"]
    root_path = ""

  [plugins."io.containerd.tracing.processor.v1.otlp"]
    endpoint = ""
    insecure = false
    protocol = ""

[proxy_plugins]

[stream_processors]

  [stream_processors."io.containerd.ocicrypt.decoder.v1.tar"]
    accepts = ["application/vnd.oci.image.layer.v1.tar+encrypted"]
    args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
    env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
    path = "ctd-decoder"
    returns = "application/vnd.oci.image.layer.v1.tar"

  [stream_processors."io.containerd.ocicrypt.decoder.v1.tar.gzip"]
    accepts = ["application/vnd.oci.image.layer.v1.tar+gzip+encrypted"]
    args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
    env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
    path = "ctd-decoder"
    returns = "application/vnd.oci.image.layer.v1.tar+gzip"

[timeouts]
  "io.containerd.timeout.bolt.open" = "0s"
  "io.containerd.timeout.shim.cleanup" = "5s"
  "io.containerd.timeout.shim.load" = "5s"
  "io.containerd.timeout.shim.shutdown" = "3s"
  "io.containerd.timeout.task.state" = "2s"

[ttrpc]
  address = ""
  gid = 0
  uid = 0

Metadata

Metadata

Assignees

No one assigned

    Type

    Projects

    Status

    Todo

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions