#!/bin/bash
#Roy Cohen          :roy@wondercohen.nl
#objective          :OpenShift patching tool using yum (RHEL and CentOS)
#First line of code :04/09/2019
#last update        :06/09/2019
#version            :0.1
#inspired by /home/freark/bin/batch_upgrade.sh

#Check if root is running the script
if [[ $EUID -ne 0 ]]; then
    echo "This script must be run as root"
    exit 1
fi

# Parameters
NAGIOS_SCRIPT='/nagios_add_downtime_host.sh'
BASE_LOG_DIR=/var/log/batch_upgrade
TIMESTAMP=$(date +%Y%m%d.%H%M)

#Set the master host
read -p "Enter OpenShift master: " OS_MASTER

#Set node variables: nodes in Ready state and the count of drained/unschedulable nodes
OS_NODES_READY=($(ssh -n "$OS_MASTER" "oc get nodes" | grep -w "Ready" | cut -d " " -f1))
COUNT_OF_OS_NODES_DRAINED=$(ssh -n "$OS_MASTER" "oc get nodes" | egrep "NotReady|SchedulingDisabled" | wc -l)

#TODO what if there is more than one node already drained?
if [[ "$COUNT_OF_OS_NODES_DRAINED" -gt 0 ]]; then
    echo -e "\nNOTE: There are drained nodes on the cluster\nPlease check the master node for possible issues"
    exit 1
fi

############################### OPENSHIFT PATCHING BEGINS HERE ###############################

#List the Ready nodes on the OpenShift master, then prompt for the node to update
function list_nodes() {
    printf '%s\n' "${OS_NODES_READY[@]}"
    set_update_host
}

#Set the hostname for the update
function set_update_host() {
    echo "Set the OpenShift node that needs to be updated or press ^c to bailout/finish"
    read -p "Enter OpenShift node: " HOST_TO_UPDATE
}

#Pause on an error (non-zero exit code passed as the first argument)
function pause_on_error {
    if [ $1 -ne 0 ]
    then
        echo -e "\e[31mThe last command resulted in an error.\e[0m Press ^c to bailout or enter to continue"
        read
    fi
}

#Add Nagios downtime for the host
function nagios_downtime() {
    echo "Adding downtime for host $HOST_TO_UPDATE..."
    "$NAGIOS_SCRIPT" --host="$HOST_TO_UPDATE" --comment='Batch Updating Host'
    pause_on_error $?
}

#Just a counter: 120 ticks of 0.5s, roughly one minute in total
function time_counter() {
    count=0
    total=120
    while [ $count -lt $total ]; do
        sleep 0.5
        count=$(( count + 1 ))
        pd=$(( count * 60 / total ))    # seconds elapsed out of ~60
        printf '\r%ss' "${pd}"
    done
    printf '\n'
}

#Drain the node
function drain() {
    echo "Draining $HOST_TO_UPDATE"
    ssh -n "$OS_MASTER" "oc adm drain --ignore-daemonsets $HOST_TO_UPDATE"
    rc=$?
    echo "
 __________________________________________________________________________
< This will take a minute, just making sure that the node is fully drained >
 --------------------------------------------------------------------------
        \   ^__^
         \  (oo)\_______
            (__)\       )\/\\
                ||----w |
                ||     ||
"
    time_counter
    pause_on_error $rc
}

#Update node
function update() {
    mkdir -p "${BASE_LOG_DIR}"
    LOG_DIR=$(mktemp -d "${BASE_LOG_DIR}/${TIMESTAMP}.XXXX")
    echo -e "\nI'm now going to update $HOST_TO_UPDATE."
    echo "You can follow/check logs in:"
    echo "${LOG_DIR}"
    echo "Use for instance 'tail -f ${LOG_DIR}/$HOST_TO_UPDATE' in a different terminal after continuing here."
    echo "If anything fails you MUST check and fix it before proceeding!"
    echo -e "\e[33mPress ^c to bailout or enter to continue\e[0m"
    read
    echo "This may take some time..."
    echo "Please check the package list; if docker/openshift/atomic packages are included, STOP the update by replying N to yum or press ^c"
    #exclude docker and atomic packages from the update
    ssh -n "$HOST_TO_UPDATE" "grep -v exclude /etc/yum.conf > /tmp/yum.conf && echo 'exclude=docker* atomic*' >> /tmp/yum.conf && cat /tmp/yum.conf > /etc/yum.conf"
    #update command (interactive; yum asks for confirmation)
    ssh -t "$HOST_TO_UPDATE" "sleep 10 && yum update" | tee "${LOG_DIR}/$HOST_TO_UPDATE"
    rc=${PIPESTATUS[0]}
    # ssh -n $HOST_TO_UPDATE "sleep 10 && yum noninteractive -y update --security"
    pause_on_error $rc
}

#Reboot the node and wait for it to come back
function reboot_server() {
    echo "Restarting server $HOST_TO_UPDATE"
    while true; do
        echo -e "\e[33mPress 'c' to continue, 's' to skip, or ^c to bail out...\e[0m"
        read ANSWER
        if [[ "${ANSWER}" == 's' ]]; then
            echo "Skipping $HOST_TO_UPDATE"
            return 0
        fi
        if [[ "${ANSWER}" == 'c' ]]; then
            echo "Proceeding to reboot $HOST_TO_UPDATE"
            break
        fi
    done
    timeout 4 ssh -o ConnectTimeout=3 "$HOST_TO_UPDATE" "reboot"
    # no pause_on_error after the reboot, because you often get kicked out of the
    # server too quickly, which causes a non-zero exit code
    # pause_on_error $?

    # wait until the server is down
    ssh -o ConnectTimeout=2 "$HOST_TO_UPDATE" true
    rc=$?
    while [[ $rc == 0 ]]
    do
        echo "Waiting for $HOST_TO_UPDATE to be down..."
        ssh -o ConnectTimeout=2 "$HOST_TO_UPDATE" true
        rc=$?
        sleep 0.5
    done
    echo "Server $HOST_TO_UPDATE appears to be down..."

    # wait until the server answers ssh again
    ssh -o ConnectTimeout=2 "$HOST_TO_UPDATE" true
    rc=$?
    while [[ $rc != 0 ]]
    do
        echo "Waiting for $HOST_TO_UPDATE to be back up..."
        ssh -o ConnectTimeout=2 "$HOST_TO_UPDATE" true
        rc=$?
        sleep 5
    done
}

#Uncordon the node on the OpenShift master, then prompt for the next node
function uncordon() {
    echo "------------------------------------"
    echo "Reactivating $HOST_TO_UPDATE in the OpenShift master $OS_MASTER"
    ssh -n "$OS_MASTER" "oc adm uncordon $HOST_TO_UPDATE"
    pause_on_error $?
    list_nodes
}

############################### -MAIN RUN- ###############################
#list_nodes prompts for the first node; uncordon prompts for the next one at the
#end of every pass, so the cycle repeats until ^c is pressed at the node prompt
list_nodes
while true; do
    nagios_downtime
    drain
    update
    reboot_server
    uncordon
done
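
# Example session, as a rough sketch only: the hostnames below are hypothetical,
# and the flow assumes passwordless root ssh to the master and to each node,
# with oc configured on the master.
#
#   Enter OpenShift master: master01.example.com
#   node01.example.com
#   node02.example.com
#   Set the OpenShift node that needs to be updated or press ^c to bailout/finish
#   Enter OpenShift node: node01.example.com
#
# The selected node is then given Nagios downtime, drained, updated via yum,
# rebooted, and uncordoned before the script prompts for the next node.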