Back to the main page

Nagios plug-in : zpool status

With Nagios you can monitor almost everything, and its philosophy is simple.

Nagios uses plug-ins — a binary, Perl, or shell script — and checks the plug-in's return value; from that value it determines the host/service state. So Nagios neither knows nor cares what the plug-in is actually monitoring. Here is a plug-in that checks the status of all zpools in the system.

#!/usr/bin/sh
#set -x
# script name zpoolhealth.sh
# -------------------------
# Nagios plugin : determines zpool health
#
# Checks every zpool on the host, sorts pool names into per-state
# scratch files under /tmp, then prints one Nagios status line and
# exits with the matching plugin return code.
#
# Nagios plugin return values
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
# STATE_DEPENDENT is defined for completeness; it is not used below.
STATE_DEPENDENT=4

# variables
progname=`basename $0`
tmpdir=/tmp
# Scratch files are suffixed with .$$ (this shell's PID) at every use
# site, so concurrent runs do not collide with each other.
# NOTE(review): predictable names in /tmp are a classic symlink risk;
# mktemp would be safer if it is available on the target platform.
okfile=${tmpdir}/${progname}.ok-zpool
warnfile=${tmpdir}/${progname}.warn-zpool
critfile=${tmpdir}/${progname}.crit-zpool
# --- ERROR SUBROUTINE
err() {
        # Print an error message and abort the script with exit code 1.
        # printf is used instead of 'echo "\n..."' because backslash
        # escape handling in echo is not portable: the Solaris Bourne
        # shell expands \n, while bash prints it literally.  printf
        # produces the same output under both shells.
        printf "\n ERROR: %s \n\n" "$*"
        exit 1
}

# --- END SCRIPT WITH OUTPUT
endscript () {
        # Emit the single status line that Nagios will display, then
        # terminate with the plugin return code chosen by the caller.
        # Expects the globals RESULT and EXIT_STATUS to be set.
        printf "%s\n" "${RESULT}"
        exit ${EXIT_STATUS}
}

# --- CLEANING SUBROUTINE
tmp_file_cleaning () {
        # Remove any per-PID scratch files this run may have created.
        # Each file is tested first so a missing file is not an error.
        for scratch in "${okfile}.$$" "${warnfile}.$$" "${critfile}.$$"
        do
                [ -f "${scratch}" ] && rm "${scratch}"
        done
}

# --- cleaning in case of script termination and regular exit
# NOTE(review): symbolic trap names (HUP, INT, ..., EXIT) are POSIX sh;
# the legacy Solaris Bourne shell may require numeric signal numbers
# (1 2 3 6 0) instead -- confirm on the target platform.
trap tmp_file_cleaning HUP INT QUIT ABRT EXIT

# ---- find zpools in the system
# "zpool list -H" prints one pool name per line in the first field.
# When no pools exist, zpool presumably prints "no pools available",
# so the first awk field is the single word "no" -- verify on target.
# Exit 3 = STATE_UNKNOWN for Nagios.
myzpools=`zpool list -H | awk '{print $1}'`
if [ "${myzpools}" = "no" ]; then
        echo "There is no zpool(s)"; exit 3
fi

#echo My zpools: ${myzpools}

# --- get zpool health and create temp files
# For each pool, read its health and append the pool name to the
# scratch file that matches its state.
for zp in ${myzpools}
do
        # Health is the 6th column of "zpool list -H" output
        # (NAME SIZE USED AVAIL CAP HEALTH ...) on this platform.
        health=`zpool list -H ${zp} | awk '{print $6}'`

        # ${health} is quoted: if zpool produced no output the old
        # unquoted test expanded to "[ = ONLINE ]" and aborted the
        # script with a syntax error instead of reporting CRITICAL.
        # The pool name is passed as a printf argument, not as the
        # format string, so names containing % or \ cannot break it.
        if [ "${health}" = "ONLINE" ]
        then
                printf "%s " "${zp}" >> ${okfile}.$$
        elif [ "${health}" = "DEGRADED" ]
        then
                printf "%s " "${zp}" >> ${warnfile}.$$
        else
                # FAULTED / OFFLINE / UNAVAIL / REMOVED, or anything
                # unexpected (including an empty read) => critical.
                printf "%s " "${zp}" >> ${critfile}.$$
        fi
done

# --- check temp files and create output
#
# Build one status line ordered worst-first (CRIT, then WARN, then OK)
# and choose the exit code from the worst state present.  This replaces
# the original 7-branch if/elif chain, which enumerated every
# combination of scratch files, used inconsistent labels
# ("WARN(DEGRADED)" with and without a colon), and silently fell
# through -- leaving RESULT and EXIT_STATUS unset -- when none of the
# files existed.

okpools=""
warnpools=""
critpools=""
[ -f ${okfile}.$$ ] && okpools=`cat ${okfile}.$$`
[ -f ${warnfile}.$$ ] && warnpools=`cat ${warnfile}.$$`
[ -f ${critfile}.$$ ] && critpools=`cat ${critfile}.$$`

# Each pool list already carries a trailing space (written as "name "),
# so simple concatenation separates the sections.
RESULT=""
[ -n "${critpools}" ] && RESULT="CRIT(FAULT/OFFLINE/UNAVAIL): ${critpools}"
[ -n "${warnpools}" ] && RESULT="${RESULT}WARN(DEGRADED): ${warnpools}"
[ -n "${okpools}" ] && RESULT="${RESULT}OK: ${okpools}"

# worst state wins: critical > warning > ok; nothing collected => UNKNOWN
if [ -n "${critpools}" ]; then
        EXIT_STATUS="${STATE_CRITICAL}"
elif [ -n "${warnpools}" ]; then
        EXIT_STATUS="${STATE_WARNING}"
elif [ -n "${okpools}" ]; then
        EXIT_STATUS="${STATE_OK}"
else
        RESULT="UNKNOWN: no zpool health information collected"
        EXIT_STATUS="${STATE_UNKNOWN}"
fi

# call subroutine to end script
endscript

Usually I create a README file with information on how to deploy the plugin, etc.

################################################
   README about Nagios plugin zpoolhealth.sh
################################################

1. Copy plugin zpoolhealth.sh to remote host's directory /opt/csw/libexec/nagios-plugins/

2. Set the plugin's permissions to 755 and its owner to root:bin

        -rwxr-xr-x   1 root     bin         3516 Jan 23 10:32 zpoolhealth.sh

3. Add lines to remote host in file /opt/csw/etc/nrpe.cfg

        # check zpool status
        command[check_zpool_status]=/opt/csw/libexec/nagios-plugins/zpoolhealth.sh

4. Restart the NRPE service on the remote host

        {host}/> svcadm restart cswnrpe

5. Test how NRPE uses plugin on remote host, using CLI from Nagios machine (nagiosbox)

        {nagiosbox}/> /opt/csw/nagios/libexec/check_nrpe -H unixlab -c check_zpool_status
        OK: space.1 space0

6. Define Nagios service group on nagiosbox, file /etc/nagios/UNIX/services.cfg

        define servicegroup{
                servicegroup_name       zpool_status
                alias                   Zpool status
                }

7. Create service so Nagios can check the host, file /etc/nagios/UNIX/services.cfg

define service{
        use                             gen-service
        host_name                       unixlab         ;first test on unixlab
        #hostgroup_name                 SUN,CC,FILESERVER      ;if test ok, include others, copy plugin there
        service_description             Zpool status
        servicegroups                   zpool_status
        check_command                   check-nrpe!check_zpool_status
        }

8. Refresh nagios service

        > svcadm refresh nagios

-- Note:
The script deployplugin.sh can be used to do the following on multiple remote hosts:
1. copy plugin to remote host
2. backup nrpe.cfg on remote host
3. append required lines to nrpe.cfg on remote host
4. restart cswnrpe service on remote host

So, if you want to deploy the plugin on many machines, use this script:

#!/bin/sh
#set -x
#
# script name deployplugin.sh
# -----------------------------
# 1. copy plugin to remote host
# 2. backup nrpe.cfg on remote host
# 3. append required lines to nrpe.cfg on remote host
# 4. restart cswnrpe service on remote host

# -- error subroutine
err() {
        # Print an error message and abort the whole deployment run
        # with exit code 1 (so one failed host stops everything).
        # NOTE(review): relies on "echo" expanding the \n escapes --
        # the Solaris Bourne shell does this, bash does not.  Callers
        # below pass "\n"-prefixed messages, so keep running this
        # under /bin/sh on Solaris; confirm before porting to bash.
        echo "\n ERROR: $* \n"
        exit 1
}

# variables
backuptime=`date +%m-%d-%Y.%Hh%Mm%Ss` #time of nrpe.cfg backup
nrpedir="/opt/csw/etc"
nrpefile=nrpe.cfg
# command to backup nrpe.cfg
# (executed remotely via "ssh $host $backupnrpe"; the backup is a
# hidden dot-file next to the original; -p preserves mode/owner/times)
backupnrpe="cp -p ${nrpedir}/${nrpefile} ${nrpedir}/.${nrpefile}.${backuptime}"
plugin_dest_dir="/opt/csw/libexec/nagios-plugins"       # location of plugin
plugin_src_dir="/etc/master/nagios.plugin/zpool_status"
plugin=zpoolhealth.sh

# list of hosts 
# Edit this list before running: one hostname per line, whitespace
# separated (the for-loop below word-splits the string).
hostlist='
host-1
host-2
host-3
host-etc'

# For every host: verify reachability and the NRPE layout, copy the
# plugin over, back up and extend nrpe.cfg, then restart the NRPE
# service.  Any hard failure aborts the whole run via err().
for host in ${hostlist}
do
        fping -q ${host} # -q = quiet, exit status only

        if [ $? -eq 0 ]; then

                echo " --- OK --- Host ${host} is reachable, proceed."

                # check existence of directory /opt/csw/etc
                # NOTE: the command substitution is deliberately left
                # unquoted -- "file" prints "path:<whitespace>directory"
                # and the shell's word splitting strips the leading
                # whitespace that awk leaves in field 2.
                [ `ssh ${host} file ${nrpedir} | awk -F: '{print $2}'` = "directory" ] || \
                err "\n Directory ${nrpedir} doesn't exist."

                # check existence of file nrpe.cfg
                [ "`ssh ${host} file ${nrpedir}/${nrpefile} | awk '{print $2, $3}'`" = "ascii text" ] || \
                err "\n File ${nrpefile} doesn't exist."

                # on remote host, check existence of destination directory for plugins
                # (error message fixed: it referenced the undefined
                # variable ${plugindir} and printed an empty path)
                [ `ssh ${host} file ${plugin_dest_dir}  | awk -F: '{print $2}'` = "directory" ] || \
                err "\n Directory ${plugin_dest_dir} doesn't exist."

                # copy plugin to remote host
                scp -p ${plugin_src_dir}/${plugin} ${host}:${plugin_dest_dir}/${plugin} || \
                err "\n ${plugin_src_dir}/${plugin} can't be copied to ${host}"

                # backup nrpe.cfg on remote host
                ssh ${host} ${backupnrpe} || \
                err "\n nrpe.cfg can't be backup-ed on ${host}"

                # add lines to the end ($) of nrpe.cfg:
                # fetch the remote file and append the comment line into
                # a local temp copy
                ssh ${host} cat ${nrpedir}/${nrpefile} | \
                sed '$a\
# check zpool status' > /tmp/${nrpefile}.${host} ||  err "\n Can't append first line"
                # append the command definition (sed reads the temp file
                # directly; the original piped it through a useless cat)
                sed '$a\
command[check_zpool_status]=/opt/csw/libexec/nagios-plugins/zpoolhealth.sh' /tmp/${nrpefile}.${host} > /tmp/${nrpefile}.${host}.$$ || \
                err "\n Can't append second line"
                mv /tmp/${nrpefile}.${host}.$$ /tmp/${nrpefile}.${host}

                # copy tmp file to remote host
                scp -p /tmp/${nrpefile}.${host} ${host}:${nrpedir}/${nrpefile} || \
                err "\n Can't copy temp file /tmp/${nrpefile}.${host} to ${host}"
                # remove temp file
                rm /tmp/${nrpefile}.${host} || err "\n Can't remove temp file /tmp/${nrpefile}.${host}"

                # restart the cswnrpe service so it picks up the new command
                ssh ${host} svcadm restart cswnrpe || \
                err "\n The cswnrpe service can't be restarted"
                sleep 3
                # non-fatal check: only warn if the service is not back online
                [ "`ssh ${host} svcs -H cswnrpe | awk '{print $1}'`" = "online" ] || \
                echo "The cswnrpe service on ${host} is not online - check this later"

        else
                echo " ???????  Host ${host} is not reachable - check needed !"
        fi
done

exit 0
Back to the main page