Skip to content
Snippets Groups Projects
Commit 917ce8ce authored by Francisco-Javier Ramon Salguero's avatar Francisco-Javier Ramon Salguero Committed by Mark Beierl
Browse files

Fix bug 1560 to ensure that OSM is healthy after installation


This change updates the health check to determine more accurately when the
system is ready. All the deployments and statefulsets must have all their pods
running and stable for some time.

Change-Id: Ibe7ab30415ce81fd95caa0fdd66710746de7605b
Signed-off-by: default avatarramonsalguer <javier.ramon@telefonica.com>
Signed-off-by: default avatargarciadeblas <gerardo.garciadeblas@telefonica.com>
parent 0062727f
No related branches found
No related tags found
No related merge requests found
#!/bin/sh
#!/usr/bin/env bash
# Copyright 2019 ETSI
# Copyright 2020 Telefónica Investigación y Desarrollo S.A.U.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Upper bound, in seconds, of the legacy polling loop (compared against the
# accumulated sleep time).
WAIT_TIME=340 # LCM healthcheck needs 2x(30+140) seconds
# Docker Swarm services whose containers are expected to report "healthy".
SERVICES_WITH_HEALTH="nbi ro zookeeper lcm mon pol kafka"
# Number of words in SERVICES_WITH_HEALTH; word-splitting of the unquoted
# variable is intentional here.
NUM_SERVICES_WITH_HEALTH=$(echo $SERVICES_WITH_HEALTH | wc -w)
# Seconds slept after the legacy check succeeds, before exiting with status 0.
WAIT_FINAL=30
# Pod name prefixes of the OSM deployments and statefulsets on Kubernetes.
OSM_DEPLOYMENT="nbi lcm ro mon pol keystone"
OSM_STATEFULSET="zookeeper kafka mongo mysql prometheus"
# Total number of names in both lists: the minimum count of running pods the
# legacy Kubernetes check waits for.
NUM_K8S_PODS=$(echo $OSM_DEPLOYMENT $OSM_STATEFULSET | wc -w)
while getopts "w:s:n:c:k" o; do
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Default values (each one can be overridden by the command-line option shown)
sampling_period=5 # seconds slept between two consecutive samples (-p)
time_for_readiness=2 # minutes of consecutive OK samples needed to declare the system ready (-r)
time_for_failure=5 # minutes of consecutive failed samples after which the system is declared broken (-f)
KUBERNETES= # By default (empty), assumes Docker Swarm installation; -k sets it to "y"
STACK_NAME=osm # By default, "osm"; used as Kubernetes namespace or Docker stack prefix (-s)
while getopts "p:r:f:s:k" o; do
case "${o}" in
w)
WAIT_TIME=${OPTARG}
p)
sampling_period=${OPTARG}
;;
s)
STACK_NAME=${OPTARG}
r)
time_for_readiness=${OPTARG}
;;
n)
NUM_SERVICES_WITH_HEALTH=${OPTARG}
f)
time_for_failure=${OPTARG}
;;
c)
SERVICES_WITH_HEALTH="${OPTARG}"
s)
STACK_NAME=${OPTARG}
;;
k)
KUBERNETES="y"
......@@ -43,51 +41,107 @@ while getopts "w:s:n:c:k" o; do
esac
done
# Converts the time windows (minutes) into numbers of consecutive samples.
#
# The sampling period is the divisor below, so a zero, empty or non-numeric
# value would abort the arithmetic; fall back to 5 seconds (with a warning)
# instead of crashing.
if ! [[ "${sampling_period}" =~ ^[1-9][0-9]*$ ]]; then
    echo "WARNING: invalid sampling period '${sampling_period}'; using 5 seconds" >&2
    sampling_period=5
fi
oks_threshold=$(( time_for_readiness * 60 / sampling_period ))      # No. ok samples to declare the system ready
failures_threshold=$(( time_for_failure * 60 / sampling_period ))   # No. nok samples to declare the system broken
# Integer division can yield 0 (window shorter than one period, or a window of
# 0 minutes). With a threshold of 0 the sampling loop would not run at all and
# the outcome would be reported without a single check, so require at least
# one sample for each verdict.
(( oks_threshold >= 1 )) || oks_threshold=1
(( failures_threshold >= 1 )) || failures_threshold=1
# Counters of consecutive failed / successful samples.
failures_in_a_row=0
oks_in_a_row=0
####################################################################################
# Loop to check system readiness
####################################################################################
while [[ (${failures_in_a_row} -lt ${failures_threshold}) && (${oks_in_a_row} -lt ${oks_threshold}) ]]
do
#------------ CHECKS FOR KUBERNETES INSTALLATION
if [ -n "$KUBERNETES" ]
then
time=0
step=2
while [ $time -le "$WAIT_TIME" ]; do
if [ -n "$KUBERNETES" ]; then
if [ "$(kubectl get pods -n "${STACK_NAME}" | grep -i running | wc -l)" -ge "$NUM_K8S_PODS" ]; then
#all pods are running now.
sleep $WAIT_FINAL
exit 0
# Takes one readiness sample of a Kubernetes installation and updates the
# oks_in_a_row / failures_in_a_row counters accordingly.
#
# WORKLOAD_FILTER_AWK classifies the lines of
# `kubectl get <deployment|statefulset> --no-headers`:
#   - a workload is ready when its READY column ($2) reads "N/N" with N > 0
#     and, if `avail` is a column number (4 = AVAILABLE for deployments),
#     that column also equals N; this covers workloads with several replicas,
#     not only "1/1";
#   - `want=ready` prints the ready lines, any other value the not-ready ones;
#   - lines that do not parse (kubectl errors captured through 2>&1, or empty
#     output) are reported as not ready, so they count as a failed sample.
WORKLOAD_FILTER_AWK='
{
    n = split($2, r, "/")
    ok = (n == 2 && r[1] == r[2] && r[2] + 0 > 0)
    if (ok && avail > 0) ok = ($avail == r[2])
    if (ok == (want == "ready")) printf("%20s\t%s\t%s\n", $1, $2, $4)
}'
# State of Deployments
# (the namespace is quoted so an empty name cannot make -n swallow --no-headers)
DEPLOYMENTS_STATE=$(kubectl get deployment -n "${STACK_NAME}" --no-headers 2>&1)
DEPLOYMENTS_READY=$(awk -v want=ready -v avail=4 "${WORKLOAD_FILTER_AWK}" <<<"${DEPLOYMENTS_STATE}")
DEPLOYMENTS_NOT_READY=$(awk -v want=notready -v avail=4 "${WORKLOAD_FILTER_AWK}" <<<"${DEPLOYMENTS_STATE}")
COUNT_DEPLOYMENTS_READY=$(grep -c -v -e '^$' <<<"${DEPLOYMENTS_READY}")
COUNT_DEPLOYMENTS_NOT_READY=$(grep -c -v -e '^$' <<<"${DEPLOYMENTS_NOT_READY}")
# State of Statefulsets (no AVAILABLE column to check, hence avail=0)
STS_STATE=$(kubectl get statefulset -n "${STACK_NAME}" --no-headers 2>&1)
STS_READY=$(awk -v want=ready -v avail=0 "${WORKLOAD_FILTER_AWK}" <<<"${STS_STATE}")
STS_NOT_READY=$(awk -v want=notready -v avail=0 "${WORKLOAD_FILTER_AWK}" <<<"${STS_STATE}")
COUNT_STS_READY=$(grep -c -v -e '^$' <<<"${STS_READY}")
COUNT_STS_NOT_READY=$(grep -c -v -e '^$' <<<"${STS_NOT_READY}")
# OK sample
if (( COUNT_DEPLOYMENTS_NOT_READY + COUNT_STS_NOT_READY == 0 ))
then
    oks_in_a_row=$(( oks_in_a_row + 1 ))
    failures_in_a_row=0
    # Carriage return without newline: the progress line is rewritten in place
    printf '===> Successful checks: %s/%s\r' "${oks_in_a_row}" "${oks_threshold}"
# NOK sample
else
    failures_in_a_row=$(( failures_in_a_row + 1 ))
    oks_in_a_row=0
    echo
    echo "Bootstraping... ${failures_in_a_row} attempts of ${failures_threshold}"
    # Reports failed deployments
    if (( COUNT_DEPLOYMENTS_NOT_READY != 0 ))
    then
        echo "${COUNT_DEPLOYMENTS_NOT_READY} of $(( COUNT_DEPLOYMENTS_NOT_READY + COUNT_DEPLOYMENTS_READY )) deployments starting:"
        echo "${DEPLOYMENTS_NOT_READY}"
        echo
    fi
    # Reports failed statefulsets
    if (( COUNT_STS_NOT_READY != 0 ))
    then
        echo "${COUNT_STS_NOT_READY} of $(( COUNT_STS_NOT_READY + COUNT_STS_READY )) statefulsets starting:"
        echo "${STS_NOT_READY}"
        echo
    fi
fi
#------------ CHECKS FOR DOCKER SWARM INSTALLATION
else
if [ "$(sg docker -c "docker ps" | grep " ${STACK_NAME}_" | grep -i healthy | wc -l)" -ge "$NUM_SERVICES_WITH_HEALTH" ]; then
# all dockers are healthy now.
# final sleep is needed until more health checks are added to validate system is ready to handle requests
sleep $WAIT_FINAL
exit 0
# Takes one readiness sample of a Docker Swarm installation and updates the
# oks_in_a_row / failures_in_a_row counters accordingly.
#
# SERVICE_FILTER_AWK classifies the lines of `docker service ls` that belong
# to replicated services ($3): a service is ready when its REPLICAS column
# ($4) reads "N/N" with N > 0, which also covers services scaled beyond one
# replica. `want=ready` prints the ready services, any other value the
# not-ready ones.
SERVICE_FILTER_AWK='
$3 == "replicated" {
    n = split($4, r, "/")
    ok = (n == 2 && r[1] == r[2] && r[2] + 0 > 0)
    if (ok == (want == "ready")) printf("%20s\t%s\n", $2, $4)
}'
# State of Docker Services
SERVICES_STATE=$(sg docker -c "docker service ls" 2>&1 | grep " ${STACK_NAME}_")
SERVICES_READY=$(awk -v want=ready "${SERVICE_FILTER_AWK}" <<<"${SERVICES_STATE}")
SERVICES_NOT_READY=$(awk -v want=notready "${SERVICE_FILTER_AWK}" <<<"${SERVICES_STATE}")
COUNT_SERVICES_READY=$(grep -c -v -e '^$' <<<"${SERVICES_READY}")
COUNT_SERVICES_NOT_READY=$(grep -c -v -e '^$' <<<"${SERVICES_NOT_READY}")
# OK sample: at least one service must be visible and none may be pending.
# Without the first condition, a failing `docker` command or a stack with no
# services (both yield an empty list) would be scored as a success.
if (( COUNT_SERVICES_READY > 0 && COUNT_SERVICES_NOT_READY == 0 ))
then
    oks_in_a_row=$(( oks_in_a_row + 1 ))
    failures_in_a_row=0
    # Carriage return without newline: the progress line is rewritten in place
    printf '===> Successful checks: %s/%s\r' "${oks_in_a_row}" "${oks_threshold}"
# NOK sample
else
    failures_in_a_row=$(( failures_in_a_row + 1 ))
    oks_in_a_row=0
    echo
    echo "Bootstraping... ${failures_in_a_row} attempts of ${failures_threshold}"
    if (( COUNT_SERVICES_READY + COUNT_SERVICES_NOT_READY == 0 ))
    then
        echo "No replicated services found for stack ${STACK_NAME}"
    else
        echo "${COUNT_SERVICES_NOT_READY} of $(( COUNT_SERVICES_NOT_READY + COUNT_SERVICES_READY )) services starting:"
        echo "${SERVICES_NOT_READY}"
    fi
fi
fi
sleep $step
time=$((time+step))
#------------ NEXT SAMPLE
sleep ${sampling_period}
done
if [ -n "$KUBERNETES" ]; then
echo "Not all pods are running"
kubectl get pods -n "${STACK_NAME}"
for POD in $OSM_DEPLOYMENT $OSM_STATEFULSET; do
kubectl get pods -n "${STACK_NAME}" | grep -i running | grep -q ^"${POD}-" && continue
echo
echo BEGIN LOGS of pods ${POD} not running
LOG_POD=$(kubectl get pods -n "${STACK_NAME}" | grep -e ^"${POD}-" | awk '{print $1}' )
[ -z "$LOG_POD" ] && echo "${POD} Failed to deploy" || kubectl logs ${LOG_POD} -n $STACK_NAME 2>&1 | tail -n 100
echo END LOGS of services $POD not running
done
####################################################################################
# OUTCOME
####################################################################################
if [[ (${failures_in_a_row} -ge ${failures_threshold}) ]]
then
echo
echo SYSTEM IS BROKEN
exit 1
else
echo "Not all Docker services are healthy"
sg docker -c "docker ps" | grep " ${STACK_NAME}_"
for S_WITH_HEALTH in $SERVICES_WITH_HEALTH ; do
sg docker -c "docker ps" | grep " ${STACK_NAME}_" | grep -i healthy | grep -q "_${S_WITH_HEALTH}." && continue
echo
echo BEGIN LOGS of container ${S_WITH_HEALTH} not healthy
sg docker -c "docker service logs ${STACK_NAME}_${S_WITH_HEALTH} 2>&1" | tail -n 100
echo END LOGS of container ${S_WITH_HEALTH} not healthy
echo
done
echo
echo SYSTEM IS READY
fi
exit 1
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment