Back to the main page

Nagios plugin for ZFS usage

Most people use one of the many publicly available Nagios plugins to monitor disk (or file system) space usage.
I do too, and if you use a graphing tool you get a nice graph like this one, showing the total size of the file system, the size of the data and the free space.


This works well if you monitor, as in my case, a UFS file system. But what if you monitor a ZFS file system that has snapshots?
Then your graph looks a little strange. The picture explains everything.


It is not a big deal, but I have developed a plugin that checks ZFS usage and produces a more understandable graph.
The plugin relies on ZFS properties that are available from zpool version 13 (or ZFS version 4) onwards, and it exits if neither requirement is met.

Here is the plugin; place it on the remote host (example: /opt/csw/libexec/nagios-plugins/check_zfs_usage.sh)

#!/bin/sh
#set -x
# Nagios plugin: check disk usage of a ZFS filesystem against its quota.
# Requires at least zpool version 13 or ZFS version 4.
# Example: it's possible to have ZFS ver 1 on zpool ver 15 (the script supports this).

# ------------ Variables
# Script name shown in the usage text. $0 is quoted so that an install path
# containing whitespace is not word-split before basename sees it.
PROGNAME=`/usr/bin/basename "$0"`

# ------ Nagios plugin return values
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4

# ------------ Subroutines

# Print the command-line synopsis and the meaning of the three arguments.
# Reads the global PROGNAME; writes to stdout.
usage() {
        echo " Usage"
        echo "        ${PROGNAME} /zfs warn crit"
        echo ""
        echo "Note:"
        echo "1. ZFS filesystem must start with /"
        echo "2. warn is warning free space in %"
        echo "3. crit is critical free space in %"
        echo "example: /var 20 10"
        echo ""
}

# End script with output, with performance data for NagiosGraph.
# Globals read: RESULT      - the line printed for Nagios
#               EXIT_STATUS - the Nagios return code to exit with
# If EXIT_STATUS was never set, exit with UNKNOWN (3); a bare "exit" would
# return the status of the echo above, i.e. 0 = OK.
endscript () {
        echo "${RESULT}"
        exit ${EXIT_STATUS:-${STATE_UNKNOWN:-3}}
}

# ------------ check if there are 3 arguments
if [ $# -ne 3 ]; then
        usage
        exit ${STATE_UNKNOWN}
fi

# --------- check that both thresholds are whole-number percentages
# The -ge comparison below only accepts integers; without this check a
# non-numeric threshold makes test fail and the script silently continues.
for THRESHOLD in "$2" "$3"
do
        case "${THRESHOLD}" in
        ''|*[!0-9]*)
                echo "Warning[%] and Critical[%] must be whole numbers"
                exit ${STATE_UNKNOWN}
                ;;
        esac
done

# --------- check if warning is bigger than critical size
if [ "$3" -ge "$2" ]; then
        echo "Warning[%] must be bigger than Critical[%]"
        exit ${STATE_UNKNOWN}
fi

# ----------- check if first argument is a filesystem
FS=`df -n "$1"`
if [ $? -ne 0 ]; then
        echo "The $1 is not valid filesystem"
        exit ${STATE_UNKNOWN}
fi

# ----------- check if filesystem is ZFS
# sample df -n output:
# /var : zfs
# /export/atlant-dbbackup: zfs
# The type is taken as the last whitespace-separated field, which is the
# same whether or not a blank precedes the colon.
ZFS=`echo "${FS}" | awk '{print $NF}'`
# String comparison: the integer operator -ne can never tell "zfs" from
# another filesystem type name.
if [ "${ZFS}" != "zfs" ]; then
        echo "The $1 is not ZFS"
        exit ${STATE_UNKNOWN}
fi

# -------- get dataset of filesystem
# First column of the line that follows the df header line.
DATASET=`df -h "$1" | awk 'NR == 2 {print $1}'`
if [ -z "${DATASET}" ]; then
        echo "The dataset of $1 can't be determined"
        exit ${STATE_UNKNOWN}
fi

# ----------- check if ZFS is min required version 4 or ZPOOL min required ver 13
# Capture the zfs output before parsing it: $? tested after a pipeline only
# reports the status of awk, never that of zfs.
ZFSVERLINE=`zfs get -H version "${DATASET}"`
if [ $? -ne 0 ]; then
        echo "The ZFS version can't be determined, it's probably less then 4"
        exit ${STATE_UNKNOWN}
fi
ZFSVER=`echo "${ZFSVERLINE}" | awk '{print $3}'`
case "${ZFSVER}" in
''|*[!0-9]*)
        # an empty or non-numeric value would break the -lt test below
        echo "The ZFS version can't be determined, it's probably less then 4"
        exit ${STATE_UNKNOWN}
        ;;
esac

if [ "${ZFSVER}" -lt 4 ]; then
        # A ZFS version below 4 is still accepted when the zpool is version 13
        # or higher, so check the zpool version before giving up.
        ZPOOLVER=`zpool upgrade | head -1 | awk '{print $NF}' | awk -F. '{print $1}'`
        case "${ZPOOLVER}" in
        ''|*[!0-9]*)
                echo "The zpool version can't be determined"
                exit ${STATE_UNKNOWN}
                ;;
        esac
        if [ "${ZPOOLVER}" -lt 13 ]; then
                echo "The script can't support zpool ver ${ZPOOLVER} (<13) and ZFS ver ${ZFSVER} (<4)"
                exit ${STATE_UNKNOWN}
        fi
fi


# size in bytes
QUOTA=`zfs get -Hp quota "${DATASET}" | awk '{print $3}'`
# --- check if there is quota at all
case "${QUOTA}" in
0|none)
        echo "There is no quota on zfs dataset ${DATASET}"
        exit ${STATE_UNKNOWN}
        ;;
''|*[!0-9]*)
        echo "The quota of zfs dataset ${DATASET} cannot be determined"
        exit ${STATE_UNKNOWN}
        ;;
esac

# --- read every usedby* property once and check that it can be determined
# --- sometimes even with zfs ver =4 this is not possible
for i in usedbydataset usedbychildren usedbysnapshots
do
        VALUE=`zfs get -Hp ${i} "${DATASET}" | awk '{print $3}'`
        case "${VALUE}" in
        ''|*[!0-9]*)
                # covers the "-" placeholder as well as empty output
                echo "Somehow zfs property ${i} cannot be determined"
                exit ${STATE_UNKNOWN}
                ;;
        esac
        case "${i}" in
        usedbydataset)   DATA=${VALUE} ;;
        usedbychildren)  CHILDRENUSE=${VALUE} ;;
        usedbysnapshots) SNAPSHOT=${VALUE} ;;
        esac
done

# --- check if usedbydataset is not 0
# --- can happen with exported/imported zpools
if [ "${DATA}" = "0" ]; then
        echo "Somehow zfs property usedbydataset=0, probably zpool exported/imported and script can't support it"
        exit ${STATE_UNKNOWN}
fi

# size in Mbytes
QUOTA=`echo "scale=2; ${QUOTA}/1024/1024" | bc -l`
CHILDRENUSE=`echo "scale=2; ${CHILDRENUSE}/1024/1024" | bc -l`
DATA=`echo "scale=2; ${DATA}/1024/1024" | bc -l`
SNAPSHOT=`echo "scale=2; ${SNAPSHOT}/1024/1024" | bc -l`

# real quota is actually quota-usedbychildren
QUOTA=`echo "scale=2; ${QUOTA}-${CHILDRENUSE}" | bc -l`

# The percentage below divides by the real quota: zero would make bc fail and
# a negative value would turn a negative free space into a positive percentage.
USABLE=`echo "${QUOTA}" | awk '{ if ($1 + 0 > 0) { print 1 } else { print 0 } }'`
if [ "${USABLE}" != "1" ]; then
        echo "The quota of zfs dataset ${DATASET} is completely used by its children, free space can't be calculated"
        exit ${STATE_UNKNOWN}
fi

FREE=`echo "${QUOTA}-${DATA}-${SNAPSHOT}" | bc -l`
FREEPERC=`echo "scale=2; ${FREE}*100/${QUOTA}" | bc -l`

WARNING=$2
CRITICAL=$3

# Classify a free-space percentage against the two thresholds.
# Arguments: $1 - free space in %
#            $2 - warning threshold in %
#            $3 - critical threshold in %
# Output:    OK, WARNING or CRITICAL on stdout; UNKNOWN if a value is missing.
# The percentage comes from bc and is a decimal (23.33, .50, ...). The integer
# operators of test (-gt, -le) reject such values in many shells, which made
# every check fall through to CRITICAL, so the comparison is done in awk.
free_level () {
        echo "$1 $2 $3" | awk '{
                if (NF != 3) {
                        print "UNKNOWN"
                } else if ($1 + 0 > $2 + 0) {
                        print "OK"
                } else if ($1 + 0 > $3 + 0) {
                        print "WARNING"
                } else {
                        print "CRITICAL"
                }
        }'
}

LEVEL=`free_level "${FREEPERC}" "${WARNING}" "${CRITICAL}"`
case "${LEVEL}" in
OK)       EXIT_STATUS=${STATE_OK} ;;
WARNING)  EXIT_STATUS=${STATE_WARNING} ;;
CRITICAL) EXIT_STATUS=${STATE_CRITICAL} ;;
*)        LEVEL=UNKNOWN
          EXIT_STATUS=${STATE_UNKNOWN} ;;
esac

# The four values after the colon are the ones parsed for graphing.
RESULT="ZFS ver${ZFSVER} $1 ${LEVEL} Free space ${FREE}MB ${FREEPERC}% : ${QUOTA}, ${SNAPSHOT}, ${DATA}, ${FREE}"

# ------- provide output and nagios return value
endscript

You need to define a new Nagios command on your Nagios machine (example: in /etc/nagios/COMMON/commands.cfg)

# check usage of ZFS
# example: on the monitored host add this line to /opt/csw/etc/nrpe.cfg
# command[check_zfs_usage]=/opt/csw/libexec/nagios-plugins/check_zfs_usage.sh $ARG1$ $ARG2$ $ARG3$
# $ARG1$ = ZFS mount point, $ARG2$ = warning free space in %, $ARG3$ = critical free space in %
# The arguments are handed to check_nrpe with -a; "!" only separates arguments
# in a service's check_command and is not interpreted inside command_line.
define command{
        command_name    check-nrpe-zfs-usage
        command_line    $USER1$/check_nrpe -H $HOSTADDRESS$ -t 30 -c check_zfs_usage -a $ARG1$ $ARG2$ $ARG3$
        }

Also define a new Nagios service on the Nagios machine (example: in /etc/nagios/UNIX/service-DISK-ZFS.cfg)

define service{
        # check ZFS usage of the / filesystem on host unixlab
        # check_command arguments, separated by "!":
        #   mount point (/), warning free space in % (20), critical free space in % (10)
        use                             gen-service
        host_name                       unixlab
        service_description             ZFS-/
        check_command                   check-nrpe-zfs-usage!/!20!10
        }

And configure NRPE (on the remote host) to use the plugin and check ZFS usage.
(example: /opt/csw/etc/nrpe.cfg)

# check usage of ZFS
# $ARG1$ = ZFS mount point, $ARG2$ = warning free space in %, $ARG3$ = critical free space in %
# NOTE(review): NRPE normally substitutes $ARGn$ only when command arguments
# are enabled (dont_blame_nrpe=1) - confirm against the NRPE build in use.
command[check_zfs_usage]=/opt/csw/libexec/nagios-plugins/check_zfs_usage.sh $ARG1$ $ARG2$ $ARG3$

I use NETWAYS Nagios Grapher v1.7.1 to draw the graphs, so here is the graph configuration on the Nagios machine.
(example /etc/nagios/ngraph.d/check_zfs_usage.ncfg)

# ---------- Help ------------------------------------
# service_name =
#       regular expression used to identify the service
#
# graph_log_regex =
#       regular expression used to find the wanted value in the performance data;
#       the value to capture must be in round brackets ()
#
# graph_value = variable name in rrd database, no empty space
#
# graph_units = units on Y axis, X axis is time
#
# graph_legend = it contains key for variable, shows under graph
#
# page = optional
#
# rrd_plottype = LINE1 is simple line, AREA is filled out surface
#
# -----------------------------------------------

# example of plugin output
# ZFS ver4 / OK Free space 1911.93MB 23.33% : 8192.00, 2048.66, 4231.41, 1911.93
# The four values after the colon are, in order: quota, snapshot, data, free (all in MB).

# --- free space: 4th value after the colon
define ngraph{
        service_name            ZFS
        graph_log_regex         \d*\.\d*, \d*\.\d*, \d*\.\d*, (\d*\.\d*)
        graph_value             free
        graph_units             MB
        #graph_legend           Free space
        rrd_plottype            AREA
        rrd_color               00FFFF # cyan
        hide                    yes
        graph_lower_limit       0
}

# --- text line with the latest "free" value
define ngraph{
        service_name            ZFS
        type                    GPRINT
        print_source            free # source is graph_value previously defined
        print_description       Free disk space:
        print_function          LAST # returns most recent update of RRA (round robin archive)
        print_format            %11.2lf MB
        print_eol               left # start next GPRINT in new row
}

# --- data (usedbydataset): 3rd of the four comma-separated values in the plugin output
define ngraph{
        service_name            ZFS
        graph_log_regex         \d*\.\d*, \d*\.\d*, (\d*\.\d*), \d*\.\d*
        graph_value             data
        graph_units             MB
        graph_legend            ZFS data
        graph_lower_limit       0
        rrd_plottype            AREA
        rrd_color               008000 # green
}

# --- text with the latest "data" value
define ngraph{
        service_name            ZFS
        type                    GPRINT
        print_source            data
        print_description       Latest:
        print_function          LAST # returns most recent update of RRA (round robin archive)
        print_format            %2.2lf
}

# --- text with the maximum "data" value
define ngraph{
        service_name            ZFS
        type                    GPRINT
        print_source            data
        print_description       Maximum:
        print_function          MAX  # returns max value of RRA (round robin archive)
        print_format            %2.2lf
        print_eol               left # start next GPRINT in new row
}

# --- snapshot (usedbysnapshots): 2nd of the four comma-separated values in the plugin output
define ngraph{
        service_name            ZFS
        graph_log_regex         \d*\.\d*, (\d*\.\d*), \d*\.\d*, \d*\.\d*
        graph_value             snapshot
        graph_units             MB
        graph_legend            ZFS snapshot
        graph_lower_limit       0
        rrd_plottype            STACK # place new value (snapshot) on top of previous (data)
        rrd_color               C0C0C0 # silver
}

# --- text with the latest "snapshot" value
define ngraph{
        service_name            ZFS
        type                    GPRINT
        print_source            snapshot
        print_description       Latest:
        print_function          LAST
        print_format            %2.2lf
}

# --- text with the maximum "snapshot" value
define ngraph{
        service_name            ZFS
        type                    GPRINT
        print_source            snapshot
        print_description       Maximum:
        print_function          MAX
        print_format            %2.2lf
        print_eol               left # start next GPRINT in new row
}

# --- quota: 1st of the four comma-separated values; the regex is anchored on the ": " before it
define ngraph{
        service_name            ZFS
        graph_log_regex         : (\d*\.\d*), \d*\.\d*, \d*\.\d*, \d*\.\d*
        graph_value             quota
        graph_units             MB
        graph_legend            ZFS quota
        graph_lower_limit       0
        rrd_plottype            LINE1
        rrd_color               FF0000 # red
}

# --- text with the latest "quota" value
define ngraph{
        service_name            ZFS
        type                    GPRINT
        print_source            quota
        print_description       Latest:
        print_function          LAST
        print_format            %2.2lf
}

# --- text with the maximum "quota" value
define ngraph{
        service_name            ZFS
        type                    GPRINT
        print_source            quota
        print_description       Maximum:
        print_function          MAX
        print_format            %2.2lf
        print_eol               left # start next GPRINT in new row
}

And finally, here is the nice graph.


Back to the main page