Fix bug 1654 to ensure that OSM is healthy after installation 52/11152/2
author ramonsalguer <javier.ramon@telefonica.com>
Thu, 16 Jul 2020 12:42:04 +0000 (14:42 +0200)
committer garciadeblas <gerardo.garciadeblas@telefonica.com>
Mon, 13 Sep 2021 08:37:50 +0000 (10:37 +0200)
This change updates the health check to determine more accurately when the
system is ready. All deployments and statefulsets must have all their pods
running and remain stable for some time.

Change-Id: Ibe7ab30415ce81fd95caa0fdd66710746de7605b
Signed-off-by: ramonsalguer <javier.ramon@telefonica.com>
Signed-off-by: garciadeblas <gerardo.garciadeblas@telefonica.com>
installers/osm_health.sh

index e26e762..5ee3d78 100755 (executable)
@@ -1,41 +1,39 @@
-#!/bin/sh
+#!/usr/bin/env bash
 
-# Copyright 2019 ETSI
+#   Copyright 2020 Telefónica Investigación y Desarrollo S.A.U.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#       http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-# implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-WAIT_TIME=340  # LCM healthcheck needs 2x(30+140) senconds
-SERVICES_WITH_HEALTH="nbi ro zookeeper lcm mon pol kafka"
-NUM_SERVICES_WITH_HEALTH=$(echo $SERVICES_WITH_HEALTH | wc -w)
-WAIT_FINAL=30
-OSM_DEPLOYMENT="nbi lcm ro mon pol keystone"
-OSM_STATEFULSET="zookeeper kafka mongo mysql prometheus"
-NUM_K8S_PODS=$(echo $OSM_DEPLOYMENT $OSM_STATEFULSET | wc -w)
-
-while getopts "w:s:n:c:k" o; do
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+# Default values
+sampling_period=5       # seconds
+time_for_readiness=2    # minutes ready
+time_for_failure=5      # minutes broken
+KUBERNETES=             # By default, assumes Docker Swarm installation
+STACK_NAME=osm          # By default, "osm"
+
+while getopts "p:r:f:s:k" o; do
     case "${o}" in
-        w)
-            WAIT_TIME=${OPTARG}
+        p)
+            sampling_period=${OPTARG}
             ;;
-        s)
-            STACK_NAME=${OPTARG}
+        r)
+            time_for_readiness=${OPTARG}
             ;;
-        n)
-            NUM_SERVICES_WITH_HEALTH=${OPTARG}
+        f)
+            time_for_failure=${OPTARG}
             ;;
-        c)
-            SERVICES_WITH_HEALTH="${OPTARG}"
+        s)
+            STACK_NAME=${OPTARG}
             ;;
         k)
             KUBERNETES="y"
@@ -43,51 +41,107 @@ while getopts "w:s:n:c:k" o; do
     esac
 done
 
+oks_threshold=$((time_for_readiness*60/${sampling_period}))     # No. ok samples to declare the system ready
+failures_threshold=$((time_for_failure*60/${sampling_period}))  # No. nok samples to declare the system broken
+failures_in_a_row=0
+oks_in_a_row=0
+
+
+####################################################################################
+# Loop to check system readiness
+####################################################################################
+while [[ (${failures_in_a_row} -lt ${failures_threshold}) && (${oks_in_a_row} -lt ${oks_threshold}) ]]
+do
+
+    #------------ CHECKS FOR KUBERNETES INSTALLATION
+    if [ -n "$KUBERNETES" ]
+    then
 
-time=0
-step=2
-while [ $time -le "$WAIT_TIME" ]; do
-    if [ -n "$KUBERNETES" ]; then
-        if [ "$(kubectl get pods -n "${STACK_NAME}" | grep -i running | wc -l)" -ge "$NUM_K8S_PODS" ]; then
-            #all pods are running now.
-            sleep $WAIT_FINAL
-            exit 0
+        # State of Deployments
+        DEPLOYMENTS_STATE=$(kubectl get deployment -n ${STACK_NAME} --no-headers 2>&1)
+        DEPLOYMENTS_READY=$(echo "${DEPLOYMENTS_STATE}" | awk '$2=="1/1" && $4=="1" {printf ("%20s\t%s\t%s\n", $1, $2, $4)}')
+        DEPLOYMENTS_NOT_READY=$(echo "${DEPLOYMENTS_STATE}" | awk '$2!="1/1" || $4!="1" {printf ("%20s\t%s\t%s\n", $1, $2, $4)}')
+        COUNT_DEPLOYMENTS_READY=$(echo "${DEPLOYMENTS_READY}"| grep -v -e '^$' | wc -l)
+        COUNT_DEPLOYMENTS_NOT_READY=$(echo "${DEPLOYMENTS_NOT_READY}" | grep -v -e '^$' | wc -l)
+
+        # State of Statefulsets
+        STS_STATE=$(kubectl get statefulset -n ${STACK_NAME} --no-headers 2>&1)
+        STS_READY=$(echo "${STS_STATE}" | awk '$2=="1/1" {printf ("%20s\t%s\t%s\n", $1, $2, $4)}')
+        STS_NOT_READY=$(echo "${STS_STATE}" | awk '$2!="1/1" {printf ("%20s\t%s\t%s\n", $1, $2, $4)}')
+        COUNT_STS_READY=$(echo "${STS_READY}" | grep -v -e '^$' | wc -l)
+        COUNT_STS_NOT_READY=$(echo "${STS_NOT_READY}" | grep -v -e '^$' | wc -l)
+
+        # OK sample
+        if [[ $((${COUNT_DEPLOYMENTS_NOT_READY}+${COUNT_STS_NOT_READY})) -eq 0 ]]
+        then
+            ((++oks_in_a_row))
+            failures_in_a_row=0
+            echo -ne ===\> Successful checks: "${oks_in_a_row}"/${oks_threshold}\\r
+        # NOK sample
+        else
+            ((++failures_in_a_row))
+            oks_in_a_row=0
+            echo
+            echo Bootstraping... "${failures_in_a_row}" attempts of ${failures_threshold}
+
+            # Reports failed deployments
+            if [[ "${COUNT_DEPLOYMENTS_NOT_READY}" -ne 0 ]]
+            then
+                echo ${COUNT_DEPLOYMENTS_NOT_READY} of $((${COUNT_DEPLOYMENTS_NOT_READY}+${COUNT_DEPLOYMENTS_READY})) deployments starting:
+                echo "${DEPLOYMENTS_NOT_READY}"
+                echo
+            fi
+
+            # Reports failed statefulsets
+            if [[ "${COUNT_STS_NOT_READY}" -ne 0 ]]
+            then
+                echo ${COUNT_STS_NOT_READY} of $((${COUNT_STS_NOT_READY}+${COUNT_STS_READY})) statefulsets starting:
+                echo "${STS_NOT_READY}"
+                echo
+            fi
         fi
+
+    #------------ CHECKS FOR DOCKER SWARM INSTALLATION
     else
-        if [ "$(sg docker -c "docker ps" | grep " ${STACK_NAME}_" | grep -i healthy | wc -l)" -ge "$NUM_SERVICES_WITH_HEALTH" ]; then
-            # all dockers are healthy now.
-            # final sleep is needed until more health checks are added to validate system is ready to handle requests
-            sleep $WAIT_FINAL
-            exit 0
+        # State of Docker Services
+        SERVICES_STATE=$(sg docker -c "docker service ls" 2>&1 | grep " ${STACK_NAME}_")
+        SERVICES_READY=$(echo "${SERVICES_STATE}" | awk '$3=="replicated" && $4=="1/1" {printf ("%20s\t%s\n", $2, $4)}')
+        SERVICES_NOT_READY=$(echo "${SERVICES_STATE}" | awk '$3=="replicated" && $4!="1/1" {printf ("%20s\t%s\n", $2, $4)}')
+        COUNT_SERVICES_READY=$(echo "${SERVICES_READY}" | grep -v -e '^$' | wc -l)
+        COUNT_SERVICES_NOT_READY=$(echo "${SERVICES_NOT_READY}" | grep -v -e '^$' | wc -l)
+
+        # OK sample
+        if [[ ${COUNT_SERVICES_NOT_READY} -eq 0 ]]
+        then
+            ((++oks_in_a_row))
+            failures_in_a_row=0
+            echo -ne ===\> Successful checks: "${oks_in_a_row}"/${oks_threshold}\\r
+        # NOK sample
+        else
+            ((++failures_in_a_row))
+            oks_in_a_row=0
+            echo
+            echo Bootstraping...  "${failures_in_a_row}" attempts of ${failures_threshold}
+            echo ${COUNT_SERVICES_NOT_READY} of $((${COUNT_SERVICES_NOT_READY}+${COUNT_SERVICES_READY})) services starting:
+            echo "${SERVICES_NOT_READY}"
         fi
     fi
 
-    sleep $step
-    time=$((time+step))
+    #------------ NEXT SAMPLE
+    sleep ${sampling_period}
+
 done
 
-if [ -n "$KUBERNETES" ]; then
-    echo "Not all pods are running"
-    kubectl get pods -n "${STACK_NAME}"
-    for POD in $OSM_DEPLOYMENT $OSM_STATEFULSET; do
-        kubectl get pods -n "${STACK_NAME}" | grep -i running | grep -q ^"${POD}-" && continue
-        echo
-        echo BEGIN LOGS of pods ${POD} not running
-        LOG_POD=$(kubectl get pods -n "${STACK_NAME}" | grep -e ^"${POD}-" | awk '{print $1}' )
-        [ -z "$LOG_POD" ] && echo "${POD} Failed to deploy" || kubectl logs ${LOG_POD} -n $STACK_NAME 2>&1 | tail -n 100
-        echo END LOGS of services $POD not running
-    done
+
+####################################################################################
+# OUTCOME
+####################################################################################
+if [[ (${failures_in_a_row} -ge ${failures_threshold}) ]]
+then
+    echo
+    echo SYSTEM IS BROKEN
+    exit 1
 else
-    echo "Not all Docker services are healthy"
-    sg docker -c "docker ps" | grep " ${STACK_NAME}_"
-    for S_WITH_HEALTH in $SERVICES_WITH_HEALTH ; do
-        sg docker -c "docker ps" | grep " ${STACK_NAME}_" | grep -i healthy | grep -q "_${S_WITH_HEALTH}."  && continue
-        echo
-        echo BEGIN LOGS of container ${S_WITH_HEALTH} not healthy
-        sg docker -c "docker service logs ${STACK_NAME}_${S_WITH_HEALTH} 2>&1" | tail -n 100
-        echo END LOGS of container ${S_WITH_HEALTH} not healthy
-        echo
-    done
+    echo
+    echo SYSTEM IS READY
 fi
-
-exit 1