#!/bin/bash
#Roy Cohen          :roy@wondercohen.nl
#objective          :OpenShift patching tool using yum (RHEL and CentOS)
#First line of code :04/09/2019
#last update        :06/09/2019
#version            :0.1
#inspired by /home/freark/bin/batch_upgrade.sh

#Check if root is running the script
if [[ $EUID -ne 0 ]]; then
    echo "This script must be run as root"
    exit 1
fi

# Parameters
NAGIOS_SCRIPT='/nagios_add_downtime_host.sh'
BASE_LOG_DIR=/var/log/batch_upgrade
TIMESTAMP=$(date +%Y%m%d.%H%M)

#Set the master host
read -p "Enter OpenShift master: " OS_MASTER

#Set node variables: nodes in Ready state and the count of drained/unschedulable nodes
OS_NODES_READY=($(ssh -n "$OS_MASTER" "oc get nodes" | grep -w "Ready" | cut -d " " -f1))
COUNT_OF_OS_NODES_DRAINED=$(ssh -n "$OS_MASTER" "oc get nodes" | egrep "NotReady|SchedulingDisabled" | wc -l)

#TODO what if there is more than one node already drained?
if [[ "$COUNT_OF_OS_NODES_DRAINED" -gt 0 ]]; then
    echo -e "\nNOTE: There are drained nodes on the cluster\nPlease check the master node for possible issues"
    exit 1
fi

############################### OPENSHIFT PATCHING BEGINS HERE ###############################

#List the Ready nodes on the OpenShift master, then prompt for the node to update
function list_nodes() {
    printf '%s\n' "${OS_NODES_READY[@]}"
    set_update_host
}

#Set the hostname for the update
function set_update_host() {
    echo "Set the OpenShift node that needs to be updated or press ^c to bailout/finish"
    read -p "Enter OpenShift node: " HOST_TO_UPDATE
}

#Pause on an error (non-zero exit code passed as the first argument)
function pause_on_error {
    if [ $1 -ne 0 ]
    then
        echo -e "\e[31mThe last command resulted in an error.\e[0m Press ^c to bailout or enter to continue"
        read
    fi
}

#Add Nagios downtime for the host
function nagios_downtime() {
    echo "Adding downtime for host $HOST_TO_UPDATE..."
    "$NAGIOS_SCRIPT" --host="$HOST_TO_UPDATE" --comment='Batch Updating Host'
    pause_on_error $?
}

#Just a counter: 120 ticks of 0.5s, roughly one minute in total
function time_counter() {
    count=0
    total=120
    while [ $count -lt $total ]; do
        sleep 0.5
        count=$(( count + 1 ))
        pd=$(( count * 60 / total ))    # seconds elapsed out of ~60
        printf '\r%ss' "${pd}"
    done
    printf '\n'
}

#Drain the node
function drain() {
    echo "Draining $HOST_TO_UPDATE"
    ssh -n "$OS_MASTER" "oc adm drain --ignore-daemonsets $HOST_TO_UPDATE"
    rc=$?
    echo "
 __________________________________________________________________________
< This will take a minute, just making sure that the node is fully drained >
 --------------------------------------------------------------------------
        \   ^__^
         \  (oo)\_______
            (__)\       )\/\\
                ||----w |
                ||     ||
"
    time_counter
    pause_on_error $rc
}

#Update node
function update() {
    mkdir -p "${BASE_LOG_DIR}"
    LOG_DIR=$(mktemp -d "${BASE_LOG_DIR}/${TIMESTAMP}.XXXX")
    echo -e "\nI'm now going to update $HOST_TO_UPDATE."
    echo "You can follow/check logs in:"
    echo "${LOG_DIR}"
    echo "Use for instance 'tail -f ${LOG_DIR}/$HOST_TO_UPDATE' in a different terminal after continuing here."
    echo "If anything fails you MUST check and fix it before proceeding!"
    echo -e "\e[33mPress ^c to bailout or enter to continue\e[0m"
    read
    echo "This may take some time..."
    echo "Please check the package list; if docker/openshift/atomic packages are included, STOP the update by replying N to yum or press ^c"
    #exclude docker and atomic packages from the update
    ssh -n "$HOST_TO_UPDATE" "grep -v exclude /etc/yum.conf > /tmp/yum.conf && echo 'exclude=docker* atomic*' >> /tmp/yum.conf && cat /tmp/yum.conf > /etc/yum.conf"
    #update command (interactive; yum asks for confirmation)
    ssh -t "$HOST_TO_UPDATE" "sleep 10 && yum update" | tee "${LOG_DIR}/$HOST_TO_UPDATE"
    rc=${PIPESTATUS[0]}
    # ssh -n $HOST_TO_UPDATE "sleep 10 && yum noninteractive -y update --security"
    pause_on_error $rc
}

#Reboot the node and wait for it to come back
function reboot_server() {
    echo "Restarting server $HOST_TO_UPDATE"
    while true; do
        echo -e "\e[33mPress 'c' to continue, 's' to skip, or ^c to bail out...\e[0m"
        read ANSWER
        if [[ "${ANSWER}" == 's' ]]; then
            echo "Skipping $HOST_TO_UPDATE"
            return 0
        fi
        if [[ "${ANSWER}" == 'c' ]]; then
            echo "Proceeding to reboot $HOST_TO_UPDATE"
            break
        fi
    done
    timeout 4 ssh -o ConnectTimeout=3 "$HOST_TO_UPDATE" "reboot"
    # no pause_on_error after the reboot, because you often get kicked out of the
    # server too quickly, which causes a non-zero exit code
    # pause_on_error $?

    # wait until the server is down
    ssh -o ConnectTimeout=2 "$HOST_TO_UPDATE" true
    rc=$?
    while [[ $rc == 0 ]]
    do
        echo "Waiting for $HOST_TO_UPDATE to be down..."
        ssh -o ConnectTimeout=2 "$HOST_TO_UPDATE" true
        rc=$?
        sleep 0.5
    done
    echo "Server $HOST_TO_UPDATE appears to be down..."

    # wait until the server answers ssh again
    ssh -o ConnectTimeout=2 "$HOST_TO_UPDATE" true
    rc=$?
    while [[ $rc != 0 ]]
    do
        echo "Waiting for $HOST_TO_UPDATE to be back up..."
        ssh -o ConnectTimeout=2 "$HOST_TO_UPDATE" true
        rc=$?
        sleep 5
    done
}

#Uncordon the node on the OpenShift master, then prompt for the next node
function uncordon() {
    echo "------------------------------------"
    echo "Reactivating $HOST_TO_UPDATE in the OpenShift master $OS_MASTER"
    ssh -n "$OS_MASTER" "oc adm uncordon $HOST_TO_UPDATE"
    pause_on_error $?
    list_nodes
}

############################### -MAIN RUN- ###############################
#list_nodes prompts for the first node; uncordon prompts for the next one at the
#end of every pass, so the cycle repeats until ^c is pressed at the node prompt
list_nodes
while true; do
    nagios_downtime
    drain
    update
    reboot_server
    uncordon
done
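
# Example session, as a rough sketch only: the hostnames below are hypothetical,
# and the flow assumes passwordless root ssh to the master and to each node,
# with oc configured on the master.
#
#   Enter OpenShift master: master01.example.com
#   node01.example.com
#   node02.example.com
#   Set the OpenShift node that needs to be updated or press ^c to bailout/finish
#   Enter OpenShift node: node01.example.com
#
# The selected node is then given Nagios downtime, drained, updated via yum,
# rebooted, and uncordoned before the script prompts for the next node.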