#!/usr/bin/env bash

#   Copyright 2020 Telefónica Investigación y Desarrollo S.A.U.
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

# Default values
sampling_period=5       # seconds
time_for_readiness=2    # minutes ready
time_for_failure=7      # minutes broken
KUBERNETES=             # By default, assumes Docker Swarm installation
STACK_NAME=osm          # By default, "osm"

#######################################
# Tells whether a value is a strictly positive decimal integer.
# Arguments: $1 - value to check
# Returns:   0 if $1 consists only of digits and is greater than zero,
#            non-zero otherwise
#######################################
is_positive_integer() {
    # The regex guard runs first so that arbitrary text never reaches the
    # arithmetic evaluation; "10#" keeps values such as "08" out of octal.
    [[ "$1" =~ ^[0-9]+$ ]] && (( 10#$1 > 0 ))
}

#######################################
# Computes how many consecutive samples cover a time window.
# Arguments: $1 - window length, in minutes (positive integer)
#            $2 - sampling period, in seconds (positive integer)
# Outputs:   number of samples (integer division), never less than 1, so that
#            a period longer than the window still requires one real check
#######################################
samples_needed() {
    local samples=$(( $1 * 60 / $2 ))
    (( samples >= 1 )) || samples=1
    echo "${samples}"
}

# Command-line options:
#   -p <seconds>  sampling period
#   -r <minutes>  time with all checks OK needed to declare the system ready
#   -f <minutes>  time with failed checks needed to declare the system broken
#   -s <name>     stack name (Docker Swarm) or namespace (Kubernetes)
#   -k            check a Kubernetes installation instead of Docker Swarm
while getopts "p:r:f:s:k" o; do
    case "${o}" in
        p)
            sampling_period=${OPTARG}
            ;;
        r)
            time_for_readiness=${OPTARG}
            ;;
        f)
            time_for_failure=${OPTARG}
            ;;
        s)
            STACK_NAME=${OPTARG}
            ;;
        k)
            KUBERNETES="y"
            ;;
        *)
            # getopts has already printed a diagnostic for the unknown option
            # or missing argument; the corresponding default stays in place.
            ;;
    esac
done

# The three timing values end up inside arithmetic expressions and in a
# division, so anything that is not a positive integer is rejected up front.
for setting in sampling_period time_for_readiness time_for_failure; do
    if ! is_positive_integer "${!setting}"; then
        echo "ERROR: ${setting} must be a positive integer, got '${!setting}'" >&2
        exit 2
    fi
    # Normalise to plain base 10 (strips leading zeros)
    printf -v "${setting}" '%d' "$((10#${!setting}))"
done

oks_threshold=$(samples_needed "${time_for_readiness}" "${sampling_period}")    # No. ok samples to declare the system ready
failures_threshold=$(samples_needed "${time_for_failure}" "${sampling_period}") # No. nok samples to declare the system broken
failures_in_a_row=0
oks_in_a_row=0


####################################################################################
# Helpers
####################################################################################

#######################################
# Filters workload listings by readiness.
# A workload is ready when its replicas column reads "N/N" with N > 0; a
# deployment must additionally report N in its AVAILABLE column.
# Arguments: $1 - kind of listing on stdin:
#                   deployment  -> "kubectl get deployment --no-headers" rows
#                   statefulset -> "kubectl get statefulset --no-headers" rows
#                   service     -> "docker service ls" rows (only rows whose
#                                  mode column is "replicated" are considered)
#            $2 - "ready" to print the ready rows, anything else to print
#                 the rows that are not ready
# Inputs:    listing rows on stdin
# Outputs:   one formatted row per selected workload. Rows that do not parse
#            as a workload (e.g. captured error messages) count as not ready
#            for deployment/statefulset listings.
#######################################
list_workloads() {
    awk -v kind="$1" -v wanted="$2" '
        # Returns N when FIELD is "N/N" (all desired replicas ready), else 0.
        function complete_replicas(field,    parts) {
            if (split(field, parts, "/") != 2) return 0
            if (parts[1] !~ /^[0-9]+$/ || parts[2] !~ /^[0-9]+$/) return 0
            if (parts[1] + 0 != parts[2] + 0) return 0
            return parts[1] + 0
        }
        kind == "service" {
            if ($3 != "replicated") next
            is_ready = (complete_replicas($4) > 0)
            if ((wanted == "ready") == is_ready) printf ("%20s\t%s\n", $2, $4)
            next
        }
        {
            replicas = complete_replicas($2)
            is_ready = (replicas > 0)
            if (kind == "deployment") {
                is_ready = (is_ready && $4 ~ /^[0-9]+$/ && $4 + 0 == replicas)
            }
            if ((wanted == "ready") == is_ready) printf ("%20s\t%s\t%s\n", $1, $2, $4)
        }'
}

#######################################
# Counts the non-empty lines of a text.
# Arguments: $1 - text, possibly empty or multi-line
# Outputs:   number of non-empty lines
#######################################
count_rows() {
    printf '%s\n' "$1" | grep -c -v -e '^$'
}

####################################################################################
# Loop to check system readiness
####################################################################################
# Samples the platform until either enough consecutive OK samples (system
# ready) or enough consecutive NOK samples (system broken) have been seen.
while (( failures_in_a_row < failures_threshold && oks_in_a_row < oks_threshold )); do

    #------------ CHECKS FOR KUBERNETES INSTALLATION
    if [[ -n "${KUBERNETES}" ]]; then

        # State of Deployments
        # (stderr is captured on purpose: an error message is not a ready row,
        # so a failing kubectl yields a NOK sample)
        DEPLOYMENTS_STATE=$(kubectl get deployment -n "${STACK_NAME}" --no-headers 2>&1)
        DEPLOYMENTS_READY=$(printf '%s\n' "${DEPLOYMENTS_STATE}" | list_workloads deployment ready)
        DEPLOYMENTS_NOT_READY=$(printf '%s\n' "${DEPLOYMENTS_STATE}" | list_workloads deployment not_ready)
        COUNT_DEPLOYMENTS_READY=$(count_rows "${DEPLOYMENTS_READY}")
        COUNT_DEPLOYMENTS_NOT_READY=$(count_rows "${DEPLOYMENTS_NOT_READY}")

        # State of Statefulsets
        STS_STATE=$(kubectl get statefulset -n "${STACK_NAME}" --no-headers 2>&1)
        STS_READY=$(printf '%s\n' "${STS_STATE}" | list_workloads statefulset ready)
        STS_NOT_READY=$(printf '%s\n' "${STS_STATE}" | list_workloads statefulset not_ready)
        COUNT_STS_READY=$(count_rows "${STS_READY}")
        COUNT_STS_NOT_READY=$(count_rows "${STS_NOT_READY}")

        # OK sample
        if (( COUNT_DEPLOYMENTS_NOT_READY + COUNT_STS_NOT_READY == 0 )); then
            oks_in_a_row=$((oks_in_a_row + 1))
            failures_in_a_row=0
            # "\r" without newline: the counter is rewritten in place
            printf '===> Successful checks: %s/%s\r' "${oks_in_a_row}" "${oks_threshold}"
        # NOK sample
        else
            failures_in_a_row=$((failures_in_a_row + 1))
            oks_in_a_row=0
            echo
            echo "Bootstraping... ${failures_in_a_row} attempts of ${failures_threshold}"

            # Reports failed deployments
            if (( COUNT_DEPLOYMENTS_NOT_READY != 0 )); then
                echo "${COUNT_DEPLOYMENTS_NOT_READY} of $((COUNT_DEPLOYMENTS_NOT_READY + COUNT_DEPLOYMENTS_READY)) deployments starting:"
                printf '%s\n' "${DEPLOYMENTS_NOT_READY}"
                echo
            fi

            # Reports failed statefulsets
            if (( COUNT_STS_NOT_READY != 0 )); then
                echo "${COUNT_STS_NOT_READY} of $((COUNT_STS_NOT_READY + COUNT_STS_READY)) statefulsets starting:"
                printf '%s\n' "${STS_NOT_READY}"
                echo
            fi
        fi

    #------------ CHECKS FOR DOCKER SWARM INSTALLATION
    else
        # State of Docker Services
        # Only rows of this stack ("<stack>_<service>") are kept; -F matches
        # the stack name literally instead of as a regular expression.
        SERVICES_STATE=$(sg docker -c "docker service ls" 2>&1 | grep -F -- " ${STACK_NAME}_")
        SERVICES_READY=$(printf '%s\n' "${SERVICES_STATE}" | list_workloads service ready)
        SERVICES_NOT_READY=$(printf '%s\n' "${SERVICES_STATE}" | list_workloads service not_ready)
        COUNT_SERVICES_READY=$(count_rows "${SERVICES_READY}")
        COUNT_SERVICES_NOT_READY=$(count_rows "${SERVICES_NOT_READY}")

        # OK sample
        # An empty listing (docker unreachable, stack not deployed) must not
        # pass as "nothing is failing".
        if [[ -n "${SERVICES_STATE}" ]] && (( COUNT_SERVICES_NOT_READY == 0 )); then
            oks_in_a_row=$((oks_in_a_row + 1))
            failures_in_a_row=0
            printf '===> Successful checks: %s/%s\r' "${oks_in_a_row}" "${oks_threshold}"
        # NOK sample
        else
            failures_in_a_row=$((failures_in_a_row + 1))
            oks_in_a_row=0
            echo
            echo "Bootstraping... ${failures_in_a_row} attempts of ${failures_threshold}"
            if [[ -z "${SERVICES_STATE}" ]]; then
                echo "No services found for stack ${STACK_NAME}"
            else
                echo "${COUNT_SERVICES_NOT_READY} of $((COUNT_SERVICES_NOT_READY + COUNT_SERVICES_READY)) services starting:"
                printf '%s\n' "${SERVICES_NOT_READY}"
            fi
        fi
    fi

    #------------ NEXT SAMPLE
    # No point in waiting once the verdict has been reached
    if (( failures_in_a_row < failures_threshold && oks_in_a_row < oks_threshold )); then
        sleep "${sampling_period}"
    fi

done


####################################################################################
# OUTCOME
####################################################################################
# Final verdict, based on the counters left by the sampling loop: reaching the
# failure threshold means the system is broken (exit status 1); otherwise it
# is reported as ready.

# Both verdicts are preceded by an empty line
echo

if [[ ${failures_in_a_row} -lt ${failures_threshold} ]]; then
    echo "SYSTEM IS READY"
else
    echo "SYSTEM IS BROKEN"
    exit 1
fi