SNMP based health check

Problem this snippet solves:

Provides SNMP-based health checking split across multiple scripts, decoupling the LTM monitor from the SNMP checks themselves.

  1. hc_status.sh: reports the CACHED status of a member
  2. snmp_poller_mgr.sh: runs multiple pollers to check the members' status
  3. snmp_poller.sh: a script that checks the status of a single member
  4. Config file: a sample configuration file
  5. Sample log rotate configuration

For more help... lahavs - at - emindsys.com

Code :

=========== Script #1: hc_status.sh ===========
#!/bin/sh

# API (positional arguments)
# ========================
#   $1 - member IP address; the first "::ffff:" occurrence is removed
#   $2 - member port (only used in log lines)
#   $3 - service name; selects which cached status file is read
# "$1" is quoted so it is passed through without word splitting or globbing.
dest_ip=$(printf '%s\n' "$1" | sed 's/::ffff://')
dest_port=$2
svc_name=$3
pid=$$

# Same path that snmp_poller.sh writes its "<epoch>:<STATE>" line to.
status_file="/var/run/hc-${dest_ip}-${svc_name}.status"
log_file=/var/log/hc-status.log

# Append one timestamped line to ${log_file}.
# Line layout: <date> <time> <pid> <svc_name> <dest_ip> <dest_port> <message>
# Arguments:   all arguments, joined by spaces, form <message>
write_log() {
  printf '%s %s %s %s %s %s\n' \
    "$(date '+%Y-%m-%d %T')" \
    "${pid}" "${svc_name}" "${dest_ip}" "${dest_port}" "$*" \
    >> "${log_file}"
}


# Report the cached status for this member.
# The status file holds "<epoch>:<STATE>"; field 2 is the state.
# A missing status file is treated as DOWN.
if [ -f "${status_file}" ]; then
  status=$(cut -d ':' -f 2 < "${status_file}")
else
  status=DOWN
fi
write_log "${status}"

# Print only when the cached state is UP. An unconditional "echo UP" here
# would report every member as healthy regardless of the cached state.
# NOTE(review): this assumes the caller is an LTM external monitor, which
# treats ANY output on stdout as "member up" - confirm against the monitor
# definition before deploying.
if [ "${status}" = "UP" ]; then
  echo "${status}"
fi
exit 0

=========== Script #2: snmp_poller_mgr.sh ===========
#!/bin/sh

# Arguments
# ========================
#   $1 - service name; also the name of the config file in /usr/local/ixi
svc_name="$1"
pid=$$

# Fixed locations used by the manager.
conf_file="/usr/local/ixi/${svc_name}"
snmp_poller="/usr/local/ixi/snmp_poller.sh"
log_file="/var/log/hc-mgr.log"
pidfile="/var/run/hc-mgr-${svc_name}.pid"

# Print the value of one "key=value" setting from ${conf_file}.
# Globals:   conf_file (read)
# Arguments: $1 - setting name, e.g. "oid"
# Outputs:   the value on stdout, one line per matching config line;
#            nothing when the key is absent
read_conf_param() {
  param=$1
  write_log "Reading ${param} from ${conf_file}"
  # Lines containing '#' anywhere are treated as comments and skipped.
  # The key must start the line (after optional blanks) so that "oid" does
  # not also pick up e.g. "void=...". Everything after the first '=' is
  # kept, so a value may itself contain '='.
  awk -v key="${param}" '
    /#/ { next }
    {
      line = $0
      sub(/^[ \t]+/, "", line)
      if (index(line, key "=") == 1) {
        print substr(line, length(key) + 2)
      }
    }
  ' "${conf_file}"
}

# Reload the settings from ${conf_file} when its content has changed.
# Globals:   conf_file (read); checksum, oid, expected_val, nodes, community,
#            interval (written)
# Returns:   0 when the settings are current (reloaded or unchanged);
#            1 when the file could not be checksummed - the settings loaded
#            earlier are then left untouched instead of being blanked.
read_conf() {
  if ! curr_checksum=$(md5sum "${conf_file}" 2> /dev/null) \
    || [ -z "${curr_checksum}" ]; then
    write_log "Cannot read ${conf_file}, keeping previous settings"
    return 1
  fi

  # Unchanged file: nothing to do.
  if [ "${checksum}" = "${curr_checksum}" ]; then
    return 0
  fi
  checksum=${curr_checksum}

  oid=$(read_conf_param oid)
  write_log "oid=${oid}"

  expected_val=$(read_conf_param expected_val)
  write_log "expected_val=${expected_val}"

  nodes=$(read_conf_param nodes)
  write_log "nodes=${nodes}"

  community=$(read_conf_param community)
  write_log "community=${community}"

  interval=$(read_conf_param interval)
  write_log "interval=${interval}"

  return 0
}

# Append one timestamped line to ${log_file}.
# Line layout: <date> <time> <pid> <svc_name> <message>
# Arguments:   all arguments, joined by spaces, form <message>
write_log() {
  msg="$*"
  printf '%s %s %s %s\n' \
    "$(date '+%Y-%m-%d %T')" "${pid}" "${svc_name}" "${msg}" \
    >> "${log_file}"
}

# Validate the environment and claim the per-service PID file.
# Globals: svc_name, conf_file, pidfile, pid (read)
# Exits:   1 when the service name is missing, the config file does not
#          exist, or another manager for this service is still running.
init() {
  write_log "=== snmp poller manager started ==="
  if [ -z "${svc_name}" ]; then
    write_log "svc_name not set"
    # A bare "exit" would return the status of write_log, i.e. success.
    exit 1
  fi

  if [ -z "${conf_file}" ] || [ ! -f "${conf_file}" ]; then
    write_log "conf_file ($conf_file) not set or not exist"
    exit 1
  fi

  if [ -f "${pidfile}" ]; then
    write_log "${pidfile} exist"
    prev_pid=$(cat "${pidfile}")
    # Signal 0 only probes whether the process exists; nothing is delivered.
    kill -0 "${prev_pid}" > /dev/null 2>&1
    err=$?
    if [ "${err}" -eq 0 ]; then
      write_log "Aborting: PID ${prev_pid} is running, error code:${err}"
      exit 1
    fi
  fi
  write_log "Setting PID file ${pidfile}"
  echo "${pid}" > "${pidfile}"
}

# Compute how long to sleep so that polling rounds start ${interval}
# seconds apart.
# Globals: start_time, interval (read); sleep_time (written)
set_sleep_time() {
  sleep_time=$((start_time + interval - $(date +%s)))
  # A round that overran the interval gives a negative value, which sleep
  # rejects; start the next round immediately instead.
  if [ "${sleep_time}" -lt 0 ]; then
    sleep_time=0
  fi
}

# Poll every configured node forever, one background poller per node.
# Globals: nodes, svc_name, expected_val, community, oid, snmp_poller (read);
#          start_time (written)
# Never returns.
main() {
  write_log "=== Starting main loop ==="
  while true; do
    # Picks up config changes between rounds.
    read_conf
    start_time=$(date +%s)
    # ${nodes} is a space-separated list: word splitting is intentional.
    for host in ${nodes}; do
      write_log "Polling started on" "host=${host}"
      # Quoted so an empty setting keeps its position in the argument list.
      "${snmp_poller}" "${host}" "${svc_name}" "${expected_val}" \
        "${community}" "${oid}" &
      # Stagger the pollers by two seconds.
      sleep 2
    done
    set_sleep_time
    write_log "Going to sleep for ${sleep_time} sec."
    sleep "${sleep_time}"
  done
}
# Entry point: run the start-up checks, then enter the polling loop.
init
main

=========== Script #3: snmp_poller.sh ===========

#!/bin/sh

# Arguments
# --------------------
#   $1 - IP address of the member to poll
#   $2 - service name (part of every file name below)
#   $3 - value the OID must return for the member to count as UP
#   $4 - SNMP community string
#   $5 - OID to query
dest_ip="$1"
svc_name="$2"
expected_val="$3"
community="$4"
oid="$5"

# Tunables
timeout=2   # passed to snmpget -t
delay=3     # mismatching polls tolerated before the reported status flips
up=UP
down=DOWN

pid=$$

# Per member/service files
base="hc-${dest_ip}-${svc_name}"
pidfile="/var/run/${base}.pid"
status_file="/var/run/${base}.status"
state_file="/var/run/${base}.last_state"
log_file="/var/log/hc-${svc_name}.log"

# Append one timestamped line to ${log_file}.
# Line layout: <date> <time> <pid> <dest_ip>-<svc_name> <message>
# Arguments:   all arguments, joined by spaces, form <message>
write_log() {
  printf '%s %s %s-%s %s\n' \
    "$(date '+%Y-%m-%d %T')" "${pid}" "${dest_ip}" "${svc_name}" "$*" \
    >> "${log_file}"
}

# Take over the per-member PID file, killing a previous poller that was
# recorded there.
# Globals: pidfile, pid (read)
init() {
  write_log "monitor started"

  if [ -f "${pidfile}" ]; then
    write_log "${pidfile} exist"
    prev_pid=$(cat "${pidfile}")
    case "${prev_pid}" in
      '' | 0 | *[!0-9]*)
        # Never hand an empty, zero or non-numeric value to kill: "0" and
        # negative numbers address whole process groups, not one process.
        write_log "ERROR: ignoring invalid PID '${prev_pid}' in ${pidfile}"
        ;;
      *)
        kill -9 "${prev_pid}" > /dev/null 2>&1
        err=$?
        write_log "ERROR: PID ${prev_pid} killed, error code:${err}"
        ;;
    esac
  fi
  echo "${pid}" > "${pidfile}"
}

# Query ${oid} on the member and derive its status.
# Globals: timeout, community, dest_ip, oid, expected_val, up, down (read);
#          result, err, status (written)
# status becomes ${up} only when snmpget succeeds AND prints exactly
# ${expected_val}; every other outcome is ${down}.
run_snmp_test() {
  # Call snmpget directly with quoted arguments; building the command as a
  # string and re-splitting it breaks on values that contain blanks.
  result=$(snmpget -O qv -t "${timeout}" -r 1 -v2c -c "${community}" \
    "${dest_ip}:161" "${oid}")
  err=$?

  if [ "${err}" -ne 0 ]; then
    write_log "ERROR: snmpget exited with error code:${err}"
    status=${down}
  elif [ "${result}" = "${expected_val}" ]; then
    status=${up}
  else
    write_log "FAIL - RESULT:${result} != ${expected_val}"
    status=${down}
  fi
}

# Decide which status to publish and persist it.
# A status that differs from the last recorded one is held back until
# ${delay} mismatching polls have been counted; until then the old status
# keeps being published.
# Globals: status, last_state_value, last_state_count, delay, state_file,
#          status_file (read)
# Files:   state_file  <- "<epoch>:<STATE>:<mismatch count>"
#          status_file <- "<epoch>:<STATE>"
response() {
  write_log "Current Status:${status} file:${status_file}"

  if [ "${status}" = "${last_state_value}" ]; then
    write_log "Status match status:${status} last_state_value=${last_state_value} last_state_count:${last_state_count}"
    new_value=${status}
    new_count=0
  else
    write_log "Status mismatch status:${status} last_state_value=${last_state_value} last_state_count:${last_state_count}"

    # Without a usable history (first run, empty or corrupt state file)
    # there is nothing to hold on to: publish the fresh status right away.
    history_ok=yes
    case "${last_state_count}" in
      '' | *[!0-9]*) history_ok=no ;;
    esac
    if [ -z "${last_state_value}" ]; then
      history_ok=no
    fi

    if [ "${history_ok}" = yes ] && [ "${last_state_count}" -lt "${delay}" ]; then
      # State change detected, using old state, increasing counter
      write_log "Status mismatch delaying status:${status}, using ${last_state_value}"
      new_value=${last_state_value}
      new_count=$((last_state_count + 1))
    else
      # Max delay reached, setting current status to be active
      write_log "Status mismatch max delay reached using, status:${status}"
      new_value=${status}
      new_count=0
    fi
  fi

  # One timestamp for both files.
  now=$(date +%s)
  echo "${now}:${new_value}:${new_count}" > "${state_file}"
  echo "${now}:${new_value}" > "${status_file}"
}

# Load the previous poll result from ${state_file}
# ("<epoch>:<STATE>:<mismatch count>"), creating the file on the first run.
# Globals: state_file, status (read); last_state_time, last_state_value,
#          last_state_count (written)
get_last_state() {
  if [ -e "${state_file}" ]; then
    # A single read keeps the three fields consistent with each other;
    # _rest swallows any unexpected extra fields.
    IFS=: read -r last_state_time last_state_value last_state_count _rest \
      < "${state_file}" || true
  else
    # First run: seed the in-memory state as well as the file, so callers
    # never see unset last_state_* variables.
    last_state_time=$(date +%s)
    last_state_value=${status}
    last_state_count=0
    echo "${last_state_time}:${last_state_value}:${last_state_count}" \
      > "${state_file}"
  fi
}


# Remove this poller's PID file.
# Globals: pidfile (read)
cleanup() {
  # Quoted and preceded by "--" so an unusual path is removed as one file
  # and is never mistaken for an option.
  rm -f -- "${pidfile}"
}

# Run one complete poll cycle and terminate the script with status 0.
# The steps run strictly in this order; their exit codes are not checked.
main() {
  for step in init run_snmp_test get_last_state response cleanup; do
    "${step}"
  done
  exit 0
}
# Entry point: run a single poll cycle for this member, then exit.
main

=========== Sample config file ===========

# Configuration file for AIM TR
# ----------------------------
# NOTE: keep comments on their own lines. snmp_poller_mgr.sh skips every
# line that contains a '#', so an inline comment would hide the setting.
# OID queried on each node with snmpget
oid=.1.3.6.1.4.1.15867.2000.3.5.2.1.2.1.0
# Value the OID must return for a node to be reported UP
expected_val=1
# Space-separated list of node addresses to poll
nodes=172.18.20.10 172.18.20.20 172.18.20.30 172.18.20.40
# SNMP community string passed to snmpget -c
community=public
# Seconds between the start of two polling rounds
interval=26

=========== Log rotate configuration ===========
# Matches the logs of all three scripts:
# hc-status.log, hc-mgr.log and hc-<service>.log
/var/log/hc-*.log {
    # keep three rotated files
    rotate 3
    # rotate once a day
    daily
}
Published Mar 18, 2015
Version 1.0
No comments. Be the first to comment.