Uploaded image for project: 'Multicloud'
  1. Multicloud
  2. MULTICLOUD-660

onap-multicloud-multicloud-k8s-etcd not running

XMLWordPrintable

    • Icon: Bug Bug
    • Resolution: Done
    • Icon: High High
    • Dublin Release
    • Dublin Release
    • None
    • ONAP installed from the OOM master branch on the Orange master daily CI chain (Kubespray used to install Kubernetes) and on the ONAP OOM gating chains

       
      onap-multicloud-multicloud-k8s-etcd is one of the last non-running pods on Master.

      The issue is reproduced in different CI chains.

       

      The pod description showing the error is:

      Name: onap-multicloud-multicloud-k8s-etcd-0
      Namespace: onap
      Node: compute05-onap-master/10.253.0.14
      Start Time: Tue, 28 May 2019 04:06:25 +0000
      Labels: app=multicloud-k8s-etcd
      chart=etcd-4.0.0
      controller-revision-hash=onap-multicloud-multicloud-k8s-etcd-7dcd58495b
      heritage=Tiller
      release=onap-multicloud
      statefulset.kubernetes.io/pod-name=onap-multicloud-multicloud-k8s-etcd-0
      Annotations: <none>
      Status: Running
      IP: 10.233.112.11
      Controllers: <none>
      Containers:
      onap-multicloud-multicloud-k8s-etcd:
      Container ID: docker://88b29dd9c01d9d44fd748b5623a179d0ffcc69a5faf778624cf918a312b45be6
      Image: k8s.gcr.io/etcd-amd64:2.2.5
      Image ID: docker-pullable://k8s.gcr.io/etcd-amd64@sha256:ca6b1687b44f9b3a6c74e7139105b0911c3ac696001e5b7260917fcf6e84898d
      Ports: 2380/TCP, 2379/TCP
      Command:
      /bin/sh
      -ec
      HOSTNAME=$(hostname)

      # store member id into PVC for later member replacement
        collect_member() {
        while ! etcdctl member list &>/dev/null; do sleep 1; done
        etcdctl member list | grep http://${HOSTNAME}.${SERVICE_NAME}:2380 | cut -d':' -f1 | cut -d'[' -f1 > /var/run/etcd/member_id
        exit 0
        }

      eps() {
      EPS=""
      for i in $(seq 0 $((${INITIAL_CLUSTER_SIZE} - 1))); do
      EPS="${EPS}${EPS:+,}http://${SET_NAME}-${i}.${SERVICE_NAME}:2379"
      done
      echo ${EPS}
      }

      member_hash() {
      etcdctl member list | grep http://${HOSTNAME}.${SERVICE_NAME}:2380 | cut -d':' -f1 | cut -d'[' -f1
      }

      # we should wait for other pods to be up before trying to join
      # otherwise we got "no such host" errors when trying to resolve other members
        for i in $(seq 0 $((${INITIAL_CLUSTER_SIZE} - 1))); do
        while true; do
        echo "Waiting for ${SET_NAME}-${i}.${SERVICE_NAME} to come up"
        ping -W 1 -c 1 ${SET_NAME}-${i}.${SERVICE_NAME} > /dev/null && break
        sleep 1s
        done
        done
      # re-joining after failure?
        if [ -e /var/run/etcd/default.etcd ]; then
        echo "Re-joining etcd member"
        member_id=$(cat /var/run/etcd/member_id)
      # re-join member
        ETCDCTL_ENDPOINT=$(eps) etcdctl member update ${member_id} http://${HOSTNAME}.${SERVICE_NAME}:2380 | true
        exec etcd --name ${HOSTNAME} \
        --listen-peer-urls http://0.0.0.0:2380 \
        --listen-client-urls http://0.0.0.0:2379\
        --advertise-client-urls http://${HOSTNAME}.${SERVICE_NAME}:2379 \
        --data-dir /var/run/etcd/default.etcd
        fi
      # etcd-SET_ID
        SET_ID=${HOSTNAME##*[^0-9]}
      # adding a new member to existing cluster (assuming all initial pods are available)
        if [ "${SET_ID}" -ge ${INITIAL_CLUSTER_SIZE} ]; then
        export ETCDCTL_ENDPOINT=$(eps)
      # member already added?
        MEMBER_HASH=$(member_hash)
        if [ -n "${MEMBER_HASH}" ]; then
      # the member hash exists but for some reason etcd failed
      # as the datadir has not be created, we can remove the member
      # and retrieve new hash
        etcdctl member remove ${MEMBER_HASH}
        fi

      echo "Adding new member"
      etcdctl member add ${HOSTNAME} http://${HOSTNAME}.${SERVICE_NAME}:2380 | grep "^ETCD_" > /var/run/etcd/new_member_envs

      if [ $? -ne 0 ]; then
      echo "Exiting"
      rm -f /var/run/etcd/new_member_envs
      exit 1
      fi

      cat /var/run/etcd/new_member_envs
      source /var/run/etcd/new_member_envs

      collect_member &

      exec etcd --name ${HOSTNAME} \
      --listen-peer-urls http://0.0.0.0:2380 \
      --listen-client-urls http://0.0.0.0:2379 \
      --advertise-client-urls http://${HOSTNAME}.${SERVICE_NAME}:2379 \
      --data-dir /var/run/etcd/default.etcd \
      --initial-advertise-peer-urls http://${HOSTNAME}.${SERVICE_NAME}:2380 \
      --initial-cluster ${ETCD_INITIAL_CLUSTER} \
      --initial-cluster-state ${ETCD_INITIAL_CLUSTER_STATE}
      fi

      PEERS=""
      for i in $(seq 0 $((${INITIAL_CLUSTER_SIZE} - 1))); do
      PEERS="${PEERS}${PEERS:+,}${SET_NAME}-${i}=http://${SET_NAME}-${i}.${SERVICE_NAME}:2380"
      done

      collect_member &

      # join member
        exec etcd --name ${HOSTNAME} \
        --initial-advertise-peer-urls http://${HOSTNAME}.${SERVICE_NAME}:2380 \
        --listen-peer-urls http://0.0.0.0:2380 \
        --listen-client-urls http://0.0.0.0:2379 \
        --advertise-client-urls http://${HOSTNAME}.${SERVICE_NAME}:2379 \
        --initial-cluster-token etcd-cluster-1 \
        --initial-cluster ${PEERS} \
        --initial-cluster-state new \
        --data-dir /var/run/etcd/default.etcd

      State: Waiting
      Reason: CrashLoopBackOff
      Last State: Terminated
      Reason: Error
      Exit Code: 1
      Started: Tue, 28 May 2019 04:38:45 +0000
      Finished: Tue, 28 May 2019 04:38:48 +0000
      Ready: False
      Restart Count: 10
      Liveness: exec [/bin/sh -c etcdctl cluster-health | grep -w healthy] delay=0s timeout=1s period=10s #success=1 #failure=3
      Environment:
      INITIAL_CLUSTER_SIZE: 1
      SET_NAME: onap-multicloud-multicloud-k8s-etcd
      SERVICE_NAME: multicloud-k8s-etcd
      Mounts:
      /var/run/etcd from onap-multicloud-multicloud-k8s-etcd-data (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from default-token-g9xlp (ro)
      Conditions:
      Type Status
      Initialized True
      Ready False
      ContainersReady False
      PodScheduled True
      Volumes:
      onap-multicloud-multicloud-k8s-etcd-data:
      Type: PersistentVolumeClaim (a reference to a PersistentVolumeClaim in the same namespace)
      ClaimName: onap-multicloud-multicloud-k8s-etcd-data-onap-multicloud-multicloud-k8s-etcd-0
      ReadOnly: false
      default-token-g9xlp:
      Type: Secret (a volume populated by a Secret)
      SecretName: default-token-g9xlp
      Optional: false
      QoS Class: BestEffort
      Node-Selectors: <none>
      Tolerations: node.kubernetes.io/not-ready=:Exists:NoExecute for 300s
      node.kubernetes.io/unreachable=:Exists:NoExecute for 300s
      Events:
      FirstSeen LastSeen Count From SubObjectPath Type Reason Message
      --------- -------- ----- ---- ------------- -------- ------ -------
      37m 37m 1 default-scheduler Warning FailedScheduling pod has unbound immediate PersistentVolumeClaims (repeated 12 times)
      37m 37m 1 default-scheduler Normal Scheduled Successfully assigned onap/onap-multicloud-multicloud-k8s-etcd-0 to compute05-onap-master
      36m 36m 1 kubelet, compute05-onap-master spec.containers{onap-multicloud-multicloud-k8s-etcd} Warning Unhealthy Liveness probe errored: rpc error: code = Unknown desc = container not running (bddc85d8ccaf550a63e6c12a1d66d92989b246336555127559e00a5b34001628)
      33m 33m 1 kubelet, compute05-onap-master spec.containers{onap-multicloud-multicloud-k8s-etcd} Warning Unhealthy Liveness probe errored: rpc error: code = Unknown desc = container not running (eb1b6f6f93e689bb4821cf2c28b23fd907f941847a607df50c1ded2b7a9438e1)
      33m 33m 1 kubelet, compute05-onap-master spec.containers{onap-multicloud-multicloud-k8s-etcd} Warning Unhealthy Liveness probe failed: Error: client: etcd cluster is unavailable or misconfigured
      error #0: dial tcp 127.0.0.1:4001: getsockopt: connection refused
      error #1: dial tcp 127.0.0.1:2379: getsockopt: connection refused

      33m 32m 2 kubelet, compute05-onap-master spec.containers{onap-multicloud-multicloud-k8s-etcd} Warning Unhealthy Liveness probe failed: Error: client: etcd cluster is unavailable or misconfigured
      error #0: dial tcp 127.0.0.1:2379: getsockopt: connection refused
      error #1: dial tcp 127.0.0.1:4001: getsockopt: connection refused

      32m 32m 1 kubelet, compute05-onap-master spec.containers{onap-multicloud-multicloud-k8s-etcd} Normal Killing Killing container with id docker://onap-multicloud-multicloud-k8s-etcd:Container failed liveness probe.. Container will be killed and recreated.
      36m 32m 4 kubelet, compute05-onap-master spec.containers{onap-multicloud-multicloud-k8s-etcd} Normal Pulled Successfully pulled image "k8s.gcr.io/etcd-amd64:2.2.5"
      36m 32m 4 kubelet, compute05-onap-master spec.containers{onap-multicloud-multicloud-k8s-etcd} Normal Created Created container
      36m 32m 4 kubelet, compute05-onap-master spec.containers{onap-multicloud-multicloud-k8s-etcd} Normal Started Started container
      37m 27m 7 kubelet, compute05-onap-master spec.containers{onap-multicloud-multicloud-k8s-etcd} Normal Pulling pulling image "k8s.gcr.io/etcd-amd64:2.2.5"
      33m 2m 134 kubelet, compute05-onap-master spec.containers{onap-multicloud-multicloud-k8s-etcd} Warning BackOff Back-off restarting failed container

       

      The Docker container logs show:

      Waiting for onap-multicloud-multicloud-k8s-etcd-0.multicloud-k8s-etcd to come up
      ping: bad address 'onap-multicloud-multicloud-k8s-etcd-0.multicloud-k8s-etcd'
      Waiting for onap-multicloud-multicloud-k8s-etcd-0.multicloud-k8s-etcd to come up
      Re-joining etcd member
      cat: can't open '/var/run/etcd/member_id': No such file or directory

       

       

      {{}}

            kirankamineni kirankamineni
            mrichomme mrichomme
            Votes:
            0 Vote for this issue
            Watchers:
            5 Start watching this issue

              Created:
              Updated:
              Resolved: