From: ramonsalguer Date: Thu, 16 Jul 2020 12:42:04 +0000 (+0200) Subject: Fix bug 1560 to ensure that OSM is healthy after installation X-Git-Tag: release-v11.0-start~53 X-Git-Url: https://osm.etsi.org/gitweb/?p=osm%2Fdevops.git;a=commitdiff_plain;h=917ce8ceb9cfb54d4eb34ebbf2cf144b72881f26 Fix bug 1560 to ensure that OSM is healthy after installation This change updates the health check to determine more accurately when the system is ready. All deployments and statefulsets must have all their pods running and stable for some time. Change-Id: Ibe7ab30415ce81fd95caa0fdd66710746de7605b Signed-off-by: ramonsalguer Signed-off-by: garciadeblas --- diff --git a/installers/osm_health.sh b/installers/osm_health.sh index e26e762c..5ee3d78d 100755 --- a/installers/osm_health.sh +++ b/installers/osm_health.sh @@ -1,41 +1,39 @@ -#!/bin/sh +#!/usr/bin/env bash -# Copyright 2019 ETSI +# Copyright 2020 Telefónica Investigación y Desarrollo S.A.U. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -WAIT_TIME=340 # LCM healthcheck needs 2x(30+140) senconds -SERVICES_WITH_HEALTH="nbi ro zookeeper lcm mon pol kafka" -NUM_SERVICES_WITH_HEALTH=$(echo $SERVICES_WITH_HEALTH | wc -w) -WAIT_FINAL=30 -OSM_DEPLOYMENT="nbi lcm ro mon pol keystone" -OSM_STATEFULSET="zookeeper kafka mongo mysql prometheus" -NUM_K8S_PODS=$(echo $OSM_DEPLOYMENT $OSM_STATEFULSET | wc -w) - -while getopts "w:s:n:c:k" o; do +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Default values +sampling_period=5 # seconds +time_for_readiness=2 # minutes ready +time_for_failure=5 # minutes broken +KUBERNETES= # By default, assumes Docker Swarm installation +STACK_NAME=osm # By default, "osm" + +while getopts "p:r:f:s:k" o; do case "${o}" in - w) - WAIT_TIME=${OPTARG} + p) + sampling_period=${OPTARG} ;; - s) - STACK_NAME=${OPTARG} + r) + time_for_readiness=${OPTARG} ;; - n) - NUM_SERVICES_WITH_HEALTH=${OPTARG} + f) + time_for_failure=${OPTARG} ;; - c) - SERVICES_WITH_HEALTH="${OPTARG}" + s) + STACK_NAME=${OPTARG} ;; k) KUBERNETES="y" @@ -43,51 +41,107 @@ while getopts "w:s:n:c:k" o; do esac done +oks_threshold=$((time_for_readiness*60/${sampling_period})) # No. ok samples to declare the system ready +failures_threshold=$((time_for_failure*60/${sampling_period})) # No. 
nok samples to declare the system broken +failures_in_a_row=0 +oks_in_a_row=0 + + +#################################################################################### +# Loop to check system readiness +#################################################################################### +while [[ (${failures_in_a_row} -lt ${failures_threshold}) && (${oks_in_a_row} -lt ${oks_threshold}) ]] +do + + #------------ CHECKS FOR KUBERNETES INSTALLATION + if [ -n "$KUBERNETES" ] + then -time=0 -step=2 -while [ $time -le "$WAIT_TIME" ]; do - if [ -n "$KUBERNETES" ]; then - if [ "$(kubectl get pods -n "${STACK_NAME}" | grep -i running | wc -l)" -ge "$NUM_K8S_PODS" ]; then - #all pods are running now. - sleep $WAIT_FINAL - exit 0 + # State of Deployments + DEPLOYMENTS_STATE=$(kubectl get deployment -n ${STACK_NAME} --no-headers 2>&1) + DEPLOYMENTS_READY=$(echo "${DEPLOYMENTS_STATE}" | awk '$2=="1/1" && $4=="1" {printf ("%20s\t%s\t%s\n", $1, $2, $4)}') + DEPLOYMENTS_NOT_READY=$(echo "${DEPLOYMENTS_STATE}" | awk '$2!="1/1" || $4!="1" {printf ("%20s\t%s\t%s\n", $1, $2, $4)}') + COUNT_DEPLOYMENTS_READY=$(echo "${DEPLOYMENTS_READY}"| grep -v -e '^$' | wc -l) + COUNT_DEPLOYMENTS_NOT_READY=$(echo "${DEPLOYMENTS_NOT_READY}" | grep -v -e '^$' | wc -l) + + # State of Statefulsets + STS_STATE=$(kubectl get statefulset -n ${STACK_NAME} --no-headers 2>&1) + STS_READY=$(echo "${STS_STATE}" | awk '$2=="1/1" {printf ("%20s\t%s\t%s\n", $1, $2, $4)}') + STS_NOT_READY=$(echo "${STS_STATE}" | awk '$2!="1/1" {printf ("%20s\t%s\t%s\n", $1, $2, $4)}') + COUNT_STS_READY=$(echo "${STS_READY}" | grep -v -e '^$' | wc -l) + COUNT_STS_NOT_READY=$(echo "${STS_NOT_READY}" | grep -v -e '^$' | wc -l) + + # OK sample + if [[ $((${COUNT_DEPLOYMENTS_NOT_READY}+${COUNT_STS_NOT_READY})) -eq 0 ]] + then + ((++oks_in_a_row)) + failures_in_a_row=0 + echo -ne ===\> Successful checks: "${oks_in_a_row}"/${oks_threshold}\\r + # NOK sample + else + ((++failures_in_a_row)) + oks_in_a_row=0 + echo + echo Bootstraping... 
"${failures_in_a_row}" attempts of ${failures_threshold} + + # Reports failed deployments + if [[ "${COUNT_DEPLOYMENTS_NOT_READY}" -ne 0 ]] + then + echo ${COUNT_DEPLOYMENTS_NOT_READY} of $((${COUNT_DEPLOYMENTS_NOT_READY}+${COUNT_DEPLOYMENTS_READY})) deployments starting: + echo "${DEPLOYMENTS_NOT_READY}" + echo + fi + + # Reports failed statefulsets + if [[ "${COUNT_STS_NOT_READY}" -ne 0 ]] + then + echo ${COUNT_STS_NOT_READY} of $((${COUNT_STS_NOT_READY}+${COUNT_STS_READY})) statefulsets starting: + echo "${STS_NOT_READY}" + echo + fi fi + + #------------ CHECKS FOR DOCKER SWARM INSTALLATION else - if [ "$(sg docker -c "docker ps" | grep " ${STACK_NAME}_" | grep -i healthy | wc -l)" -ge "$NUM_SERVICES_WITH_HEALTH" ]; then - # all dockers are healthy now. - # final sleep is needed until more health checks are added to validate system is ready to handle requests - sleep $WAIT_FINAL - exit 0 + # State of Docker Services + SERVICES_STATE=$(sg docker -c "docker service ls" 2>&1 | grep " ${STACK_NAME}_") + SERVICES_READY=$(echo "${SERVICES_STATE}" | awk '$3=="replicated" && $4=="1/1" {printf ("%20s\t%s\n", $2, $4)}') + SERVICES_NOT_READY=$(echo "${SERVICES_STATE}" | awk '$3=="replicated" && $4!="1/1" {printf ("%20s\t%s\n", $2, $4)}') + COUNT_SERVICES_READY=$(echo "${SERVICES_READY}" | grep -v -e '^$' | wc -l) + COUNT_SERVICES_NOT_READY=$(echo "${SERVICES_NOT_READY}" | grep -v -e '^$' | wc -l) + + # OK sample + if [[ ${COUNT_SERVICES_NOT_READY} -eq 0 ]] + then + ((++oks_in_a_row)) + failures_in_a_row=0 + echo -ne ===\> Successful checks: "${oks_in_a_row}"/${oks_threshold}\\r + # NOK sample + else + ((++failures_in_a_row)) + oks_in_a_row=0 + echo + echo Bootstraping... 
"${failures_in_a_row}" attempts of ${failures_threshold} + echo ${COUNT_SERVICES_NOT_READY} of $((${COUNT_SERVICES_NOT_READY}+${COUNT_SERVICES_READY})) services starting: + echo "${SERVICES_NOT_READY}" fi fi - sleep $step - time=$((time+step)) + #------------ NEXT SAMPLE + sleep ${sampling_period} + done -if [ -n "$KUBERNETES" ]; then - echo "Not all pods are running" - kubectl get pods -n "${STACK_NAME}" - for POD in $OSM_DEPLOYMENT $OSM_STATEFULSET; do - kubectl get pods -n "${STACK_NAME}" | grep -i running | grep -q ^"${POD}-" && continue - echo - echo BEGIN LOGS of pods ${POD} not running - LOG_POD=$(kubectl get pods -n "${STACK_NAME}" | grep -e ^"${POD}-" | awk '{print $1}' ) - [ -z "$LOG_POD" ] && echo "${POD} Failed to deploy" || kubectl logs ${LOG_POD} -n $STACK_NAME 2>&1 | tail -n 100 - echo END LOGS of services $POD not running - done + +#################################################################################### +# OUTCOME +#################################################################################### +if [[ (${failures_in_a_row} -ge ${failures_threshold}) ]] +then + echo + echo SYSTEM IS BROKEN + exit 1 else - echo "Not all Docker services are healthy" - sg docker -c "docker ps" | grep " ${STACK_NAME}_" - for S_WITH_HEALTH in $SERVICES_WITH_HEALTH ; do - sg docker -c "docker ps" | grep " ${STACK_NAME}_" | grep -i healthy | grep -q "_${S_WITH_HEALTH}." && continue - echo - echo BEGIN LOGS of container ${S_WITH_HEALTH} not healthy - sg docker -c "docker service logs ${STACK_NAME}_${S_WITH_HEALTH} 2>&1" | tail -n 100 - echo END LOGS of container ${S_WITH_HEALTH} not healthy - echo - done + echo + echo SYSTEM IS READY fi - -exit 1