Back to the main page

Nagios plugin to monitor ZFS appliance share usage

Intro

This is a Nagios plugin that queries a ZFS appliance directly and checks the usage of a share, instead of querying some host and checking the usage of an NFS mount.

User

Traditionally, Nagios uses a client (nrpe) to monitor local services (such as a disk usage check). In this case there is no nrpe for the ZFS appliance, so Nagios has to SSH into the appliance and do the work itself; hence a 'nagios' user needs to be present on the ZFS appliance.
Create a new read-only role for this account and add the SSH public key of the nagios user.

(zfs app): configuration users nagios> show

Properties:
                      logname = nagios
                          type = local
                           uid = 2000000000
                      fullname = nagios
              initial_password = *************
            require_annotation = false
                         roles = ro
                    kiosk_mode = false
                  kiosk_screen = status/dashboard
 

(zfs app):configuration users nagios preferences keys> show

Keys:
NAME     MODIFIED              TYPE   COMMENT
key-000  2015-3-2 17:51:31     RSA    nagios_rsa_public_key

Plugin

Your Nagios plugins are probably installed in the directory /usr/lib/nagios/plugins; this one can be named check_share_space_zfs_appliance.ksh
The plugin requires six arguments: zfsapp pool project share warn_free_% crit_free_%

#!/bin/ksh
#set -x
# -----------------------------------------------------------------
# Zarko, nagios plugin to check disk space on zfs storage appliance
# 4-7-2017 : zd: Add support for a pool
# 4-11-2017 : zd: Add support for a snapshot
# -----------------------------------------------------------------

# Script name without its directory (used in the usage text and in the
# temporary file name). Parameter expansion is used instead of
# `/bin/basename $0` so that no hard-coded binary path is needed and a
# script path containing spaces is handled correctly.
PROGNAME=${0##*/}

# ------ Nagios plugin return values
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4

# usage function
# Prints the expected command line to stdout, using ${PROGNAME} as the
# script name. The fifth argument is the warning threshold, so it is
# spelled warn_free_% (the previous text said "war_free_%").
usage() {
        printf ' Usage\n        %s <zfsapp> <pool> <project> <share> <warn_free_%%> <crit_free_%%>\n\n' "${PROGNAME}"
}

# End script with output, with performance data for pnp4nagios
# Prints ${RESULT} as the single status line and exits with ${EXIT_STATUS}.
# The status line is printed verbatim (quoted), so repeated spaces are kept
# and characters such as "*" are never expanded against the file system.
# If no result or exit status was set, the check is reported as UNKNOWN (3)
# instead of silently exiting with status 0 (OK).
endscript () {
        printf '%s\n' "${RESULT:-UNKNOWN - no check result was produced}"
        exit "${EXIT_STATUS:-3}"
}

# Exactly six arguments are required; anything else prints the usage text
# and ends the check with status 3 (UNKNOWN)
if [ $# -ne 6 ]
then
        usage
        exit 3
fi

# temp file and cleanup of same
# The appliance output is stored in ${tmp_file}.$$. That file lives inside a
# private directory created by mktemp, so its path is neither predictable
# nor writable by other users (a fixed name directly in /tmp could be
# pre-created or symlinked by another local user).
tmp_dir=$(mktemp -d "${TMPDIR:-/tmp}/${PROGNAME}.XXXXXX")
if [ -z "${tmp_dir}" ] || [ ! -d "${tmp_dir}" ]; then
        echo "UNKNOWN - cannot create temporary directory"
        exit "${STATE_UNKNOWN:-3}"
fi
tmp_file=${tmp_dir}/${PROGNAME}.tmp

# --- CLEANING SUBROUTINE
# Removes the private directory together with the temp file inside it.
tmp_file_cleaning () {
        if [ -n "${tmp_dir}" ] && [ -d "${tmp_dir}" ]; then
                rm -rf -- "${tmp_dir}"
        fi
}

# --- cleaning in case of script termination and regular exit
# Cleanup runs from the EXIT trap only. The signal traps turn a signal into
# an UNKNOWN exit (which then triggers the EXIT trap); without the explicit
# exit the script would resume after the handler with its temp file gone.
trap tmp_file_cleaning EXIT
trap 'exit "${STATE_UNKNOWN:-3}"' HUP INT QUIT ABRT

# Ask the appliance CLI for the properties of pool $2 / project $3 / share $4
# and store its output in ${tmp_file}.$$ for the parsing steps that follow.
# BatchMode makes ssh fail instead of prompting when key authentication does
# not work, so an unattended check cannot hang on a password prompt.
ssh -o BatchMode=yes -i /var/log/nagios/.ssh/id_rsa "nagios@$1" <<EOF > "${tmp_file}.$$"
shares
set pool=$2
select $3
select $4
show
EOF
ssh_status=$?

# Without any appliance output there is nothing to evaluate: report UNKNOWN
# rather than letting the later calculations run on empty values.
if [ "${ssh_status}" -ne 0 ] || [ ! -s "${tmp_file}.$$" ]; then
        echo "$2/$3/$4 UNKNOWN - cannot query ZFS appliance $1 over ssh (ssh exit status ${ssh_status})"
        exit "${STATE_UNKNOWN:-3}"
fi

# Calculation is done in megabytes (MB)

# to_megabytes <size>
# Converts a size such as "512M", "1.5G", "7T" or "0" to megabytes and
# prints it with two decimals. Recognised suffixes: B, K, M, G, T, P, E.
# A value without a suffix (for example a plain "0") is taken as bytes
# (assumption for non-zero values - confirm against the appliance output).
# Prints nothing and returns non-zero when the value is empty or malformed.
to_megabytes() {
        awk -v size="$1" '
                BEGIN {
                        if (size !~ /^[0-9]+(\.[0-9]+)?[BKMGTPE]?$/)
                                exit 1
                        unit = substr(size, length(size))
                        if (unit ~ /^[0-9]$/)
                                unit = "B"
                        # power of 1024 relative to one megabyte
                        power["B"] = -2; power["K"] = -1; power["M"] = 0
                        power["G"] = 1;  power["T"] = 2;  power["P"] = 3
                        power["E"] = 4
                        printf "%.2f\n", (size + 0) * 1024 ^ power[unit]
                }'
}

# zfs_property <name> <file>
# Prints the value of the first "<name> = <value>" line in <file>. The
# property name must match exactly, so asking for "quota" cannot pick up
# another property whose name merely ends in "quota".
zfs_property() {
        awk -v name="$1" '$1 == name && $2 == "=" { print $3; exit }' "$2"
}

# load_share_properties <file>
# Reads the appliance output in <file> and sets, in megabytes:
#   share_quota, share_space_available, share_space_data, share_space_snapshots
# A property that is missing or cannot be parsed is left empty.
load_share_properties() {
        # Get quota
        share_quota=$(to_megabytes "$(zfs_property quota "$1")")
        # Get available space
        share_space_available=$(to_megabytes "$(zfs_property space_available "$1")")
        # Get space used by data
        share_space_data=$(to_megabytes "$(zfs_property space_data "$1")")
        # Get space used by snapshots
        share_space_snapshots=$(to_megabytes "$(zfs_property space_snapshots "$1")")
}

load_share_properties "${tmp_file}.$$"

# evaluate_share_usage <pool> <project> <share> <warn_free_%> <crit_free_%>
# Works out the check verdict from the sizes (in MB) held in share_quota,
# share_space_available, share_space_data and share_space_snapshots, and
# sets RESULT (status line plus pnp4nagios performance data) and EXIT_STATUS.
#   - Total space is space used by data and snapshots.
#   - warn and crit arguments are in % of free space: OK while the available
#     space is above warn%, WARNING while it is above crit%, else CRITICAL.
#   - When the quota is 0 the percentages cannot be taken from it, so the
#     share size is taken as used + available instead.
#   - Empty or non-numeric values give UNKNOWN instead of a bogus verdict.
# All arithmetic and comparisons are done in awk because the sizes carry
# decimals, which the integer test operator -gt does not accept in every shell.
evaluate_share_usage() {
        share_label="$1/$2/$3"
        WARNING=$4
        CRITICAL=$5

        share_usage_calc=$(awk \
                -v quota="${share_quota}" \
                -v avail="${share_space_available}" \
                -v data="${share_space_data}" \
                -v snaps="${share_space_snapshots}" \
                -v warn="${WARNING}" \
                -v crit="${CRITICAL}" '
                BEGIN {
                        # all six inputs must be present and be plain numbers
                        count = split(quota " " avail " " data " " snaps " " warn " " crit, input, " ")
                        if (count != 6)
                                exit 1
                        for (i = 1; i <= count; i++)
                                if (input[i] !~ /^([0-9]+(\.[0-9]*)?|\.[0-9]+)$/)
                                        exit 1

                        total = data + snaps
                        size = quota + 0
                        if (size == 0)
                                size = total + avail

                        warn_used = (100 - warn) * size / 100
                        warn_free = warn * size / 100
                        crit_used = (100 - crit) * size / 100
                        crit_free = crit * size / 100

                        if (avail + 0 > warn_free)
                                state = "OK"
                        else if (avail + 0 > crit_free)
                                state = "WARNING"
                        else
                                state = "CRITICAL"

                        printf "%s %.2f %.2f %.2f %.2f %.2f %.2f\n", state, total, warn_used, warn_free, crit_used, crit_free, size
                }')

        if [ -z "${share_usage_calc}" ]; then
                RESULT="${share_label} UNKNOWN - cannot evaluate share usage (quota='${share_quota}' available='${share_space_available}' data='${share_space_data}' snapshots='${share_space_snapshots}' warn='${WARNING}' crit='${CRITICAL}')"
                EXIT_STATUS=${STATE_UNKNOWN}
                return
        fi

        read -r share_state share_space_total warn_used warn_free crit_used crit_free share_size <<EOF
${share_usage_calc}
EOF

        case ${share_state} in
                OK)      EXIT_STATUS=${STATE_OK} ;;
                WARNING) EXIT_STATUS=${STATE_WARNING} ;;
                *)       EXIT_STATUS=${STATE_CRITICAL} ;;
        esac
        RESULT="${share_label} ${share_state} - Free space ${share_space_available}MB | ${share_label}=${share_space_total}MB;${warn_used};${crit_used};0;${share_size}"
}

evaluate_share_usage "$2" "$3" "$4" "$5" "$6"

# ------- provide output and nagios return value
# endscript prints ${RESULT} and exits with ${EXIT_STATUS}, so nothing placed
# after this call is executed.
endscript

Graph

The pnp4nagios template is /usr/share/pnp4nagios/templates/check_zfsappliance_share_usage.php

<?php
# pnp4nagios graph template: for every data source in $this->DS it fills
# $ds_name[$KEY], $opt[$KEY] (graph options) and $def[$KEY] (graph definition).
foreach ($this->DS as $KEY=>$VAL) {
# set initial values
        # number format used for the values printed in the legend
        $fmt = '%7.3lf';
        $pct = '';
        $upper = "";
        $maximum = "";
        # divisor applied to the raw values before they are drawn
        $divis = 1;
        # single-quoted, i.e. a literal backslash followed by "n"; appended to
        # the "avg used" legend text when no maximum is known
        $return = '\n';
        # default unit when the data source does not report one
        $unit = "B";
        $label = $unit;
        if ($VAL['UNIT'] != "") {
                $unit = $VAL['UNIT'];
                $label = $unit;
                # a unit of "%%" is shown as a single percent sign and uses a
                # shorter number format
                if ($VAL['UNIT'] == "%%") {
                        $label = '%';
                        $fmt = '%5.1lf';
                        $pct = '%';
                }
        }
        if ($VAL['MAX'] != "") {
                # adjust value and unit, details in .../helpers/pnp.php
                # As used in this template: [0] text shown in the legend,
                # [1] value for the upper limit and the max line, [2] axis
                # label, [3] divisor for the raw values.
                $max = pnp::adjust_unit( $VAL['MAX'].$unit,1024,$fmt );
                $upper = "-u $max[1] ";
                $maximum = "of $max[1] $max[2]$pct used";
                $label = $max[2];
                $divis = $max[3];
                $return = '';
        }

        # Replace every "_" in the data source name with "/" for display.
        # NOTE(review): this also converts underscores that are genuinely part
        # of a pool, project or share name - confirm this is wanted.
        $ds_name[$KEY] = str_replace("_","/",$VAL['NAME']);

        # set graph labels
        $opt[$KEY]     = "--vertical-label $label -l 0 $upper --title \"Filesystem $ds_name[$KEY]\" ";

        # Graph Definitions
        $def[$KEY]     = rrd::def( "var1", $VAL['RRDFILE'], $VAL['DS'], "AVERAGE" );
        # "normalize" graph values
        $def[$KEY]    .= rrd::cdef( "v_n","var1,$divis,/");
        # grey area with a dark green outline on top of it
        $def[$KEY]    .= rrd::area( "v_n", "#c6c6c6",  $ds_name[$KEY] );
        $def[$KEY]    .= rrd::line1( "v_n", "#003300" );
        # show values in legend
        $def[$KEY]    .= rrd::gprint( "v_n", "LAST", "$fmt $label$pct $maximum ");
        $def[$KEY]    .= rrd::gprint( "v_n", "AVERAGE", "$fmt $label$pct avg used $return");
        # create max line and legend
        if ($VAL['MAX'] != "") {
                $def[$KEY] .= rrd::gprint( "v_n", "MAX", "$fmt $label$pct max used \\n" );
                $def[$KEY] .= rrd::hrule( $max[1], "#003300", "Size of FS  $max[0] \\n");
        }
        # create warning line and legend
        if ($VAL['WARN'] != "") {
                # yellow horizontal line at the warning threshold
                $warn = pnp::adjust_unit( $VAL['WARN'].$unit,1024,$fmt );
                $def[$KEY] .= rrd::hrule( $warn[1], "#ffff00", "Warning  on $warn[0] \\n" );
        }
        # create critical line and legend
        if ($VAL['CRIT'] != "") {
                # red horizontal line at the critical threshold
                $crit = pnp::adjust_unit( $VAL['CRIT'].$unit,1024,$fmt );
                $def[$KEY] .= rrd::hrule( $crit[1], "#ff0000", "Critical on $crit[0]\\n" );
        }
}
?>

Add new share for monitoring

The Nagios command definition is:

# check usage of ZFS Appliance shares
# The six arguments are handed to the plugin unchanged, in this order:
#   $ARG1$ ZFS appliance to query    $ARG2$ pool
#   $ARG3$ project                   $ARG4$ share
#   $ARG5$ warning threshold, % free $ARG6$ critical threshold, % free
define command{
   command_name check_zfsappliance_share_usage
   command_line $USER1$/check_share_space_zfs_appliance.ksh $ARG1$ $ARG2$ $ARG3$ $ARG4$ $ARG5$ $ARG6$
   }
Add a new service to Nagios, e.g.:

# Example service: checks share pool-name/project/share-name on the host's
# own address ($HOSTADDRESS$). The last two values are the plugin thresholds:
# WARNING when 20% or less of the space is free, CRITICAL when 10% or less.
define service{
        use                     generic-service,srv-pnp
        host_name               zfs-appliance-hostname
        service_description     Disk usage pool-name/project/share-name
        check_command           check_zfsappliance_share_usage!$HOSTADDRESS$!pool-name!project!share-name!20!10
        normal_check_interval   15
        }
Back to the main page