#!/usr/bin/bash

# Name this script was invoked as, with any leading directory part
# stripped; it is interpolated into the usage text.
PROGNAME=${0##*/}

# Print the usage text on stdout, followed by one empty line, then end
# the script with exit status 3 (the code this plugin pairs with the
# "UNKNOWN" status text).
# Globals:  PROGNAME (read)
# Exits:    always, with status 3
function nagios_exit_usage() {
    # One array element per output line; the last, empty element yields the
    # blank line that closes the help text.
    local usage_lines=(
        "Usage: $PROGNAME [OPTION...]"
        'Nagios plugin to check Linux RAID status'
        ''
        'Common options:'
        '  -I, --include       REGEX   Include raid device pattern'
        '  -E, --exclude       REGEX   Exclude raid device pattern'
        '  -S, --sudo-user     USER    Run cat /proc/mdstat with sudo -u USER'
        '  -h, --help                  Display this help'
        ''
        'SSH options:'
        '  -s, --ssh           CMD     Custom SSH command, see documentation below'
        '  -H, --host          HOST    SSH host'
        '  -p, --port          INT     SSH port'
        '  -u, --ssh-user      USER    SSH username'
        '  -i, --ssh-identity  FILE    SSH identity key file'
        '  -t, --timeout       INT     SSH connect timeout'
        ''
        'SNMP options:'
        '  -H, --host          HOST    SNMP host'
        '  -p, --port          INT     SNMP port'
        '  -t, --timeout       INT     Alias for SNMP option -t (timeout)'
        '  -O, --snmp-oid      OID     OID that return result in get_proc_mdstat format'
        '  -P, --snmp-options  ARGS    SNMP options, eg: -v 2c -c public, end with --'
        ''
        'When using option -s, --ssh, classic SSH options get ignored. It is then'
        'up to the caller to properly escape args and handle SSH options like'
        'identity, port, user, connect timeout.'
        "Usage example: check_mdstat -s 'ssh -i key cmd@bastion host' ..."
        ''
        'When using option -O, --snmp-oid, /proc/mdstat output is retrieved from'
        'the given OID.'
        ''
    )
    printf '%s\n' "${usage_lines[@]}"
    exit 3
}

# Store an option value in a global variable.
# The variable is left untouched, and success is returned, when the value
# is empty or is the lone character "$" (presumably what an unset Nagios
# macro leaves on the command line; confirm against the caller setup).
# Arguments: $1 - name of the global variable to assign
#            $2 - value to store
function nagopt_var() {
    case $2 in
        ''|'$') return 0 ;;
    esac
    declare -g "$1=$2"
}

# Collect a list of arguments into a global array variable.
# Arguments are taken in order until a literal "--" is met or the list is
# exhausted; the "--" itself is not stored.
# Arguments: $1    - name of the global array to (re)create
#            $2... - candidate values
function nagopt_arglist() {
    local target=$1
    shift
    local collected=() item
    for item in "$@"; do
        [[ $item == -- ]] && break
        collected+=( "$item" )
    done
    # The escaped expansion is evaluated by declare itself, inside this
    # function, so each collected word becomes exactly one array element.
    declare -ga "$target=( \"\${collected[@]}\" )"
}

# Write the mdstat text to stdout, taken from the source selected by the
# option variables:
#   - SNMP_OID and HOST set: snmpget of SNMP_OID on HOST[:PORT]
#   - SSH_CMD set:           SSH_CMD (split into words) runs the read command
#   - HOST set:              ssh to HOST runs the read command
#   - otherwise:             the read command runs locally
# The read command is "cat /proc/mdstat", prefixed with
# "sudo -n -u SUDO_USER" when SUDO_USER is set.
# Globals:  SNMP_OID, SNMP_OPTIONS, HOST, PORT, TIMEOUT, SUDO_USER, SSH_CMD,
#           SSH_IDENTITY, SSH_USER (all read only)
# Returns:  SNMP mode: 2 when the pipeline fails (no line left after
#           filtering), 0 otherwise. Other modes: the exit status of the
#           command that was run.
function get_proc_mdstat() {
    # Scratch variables stay local so that a call does not leave "cmd" and
    # "sshargs" behind in the global scope.
    local -a cmd=()
    local sshargs

    if [[ -n $SNMP_OID && -n $HOST ]]; then
        # Double quotes are stripped from the value and a "No Such Instance"
        # answer is dropped; grep then fails when nothing is left.
        snmpget -OqevU "${SNMP_OPTIONS[@]}" ${TIMEOUT:+-t "$TIMEOUT"} \
            "${HOST}${PORT:+:$PORT}" "$SNMP_OID" |
                tr -d '"' |grep -Fv 'No Such Instance' || return 2
    else
        cmd=( ${SUDO_USER:+sudo -n -u "$SUDO_USER"} cat /proc/mdstat )
        # Shell-quoted single-string form of the read command, handed to ssh
        # as one argument.
        printf -v sshargs "%q " "${cmd[@]}"
        if [[ -n $SSH_CMD ]]; then
            # custom ssh command: it is up to the caller to properly escape args
            # and handle ssh options like identity, port, user, connect timeout
            cmd=( $SSH_CMD -- "$sshargs" )
        elif [[ -n $HOST ]]; then
            cmd=(
                ssh ${SSH_IDENTITY:+-i "$SSH_IDENTITY"} ${SSH_USER:+-l "$SSH_USER"}
                ${PORT:+-p "$PORT"} -q -C -T -o StrictHostKeyChecking=no
                -o UserKnownHostsFile=/dev/null -o PreferredAuthentications=publickey
                ${TIMEOUT:+-o "ConnectTimeout=$TIMEOUT"} "$HOST" -- "$sshargs"
            )
        fi
        "${cmd[@]}"
    fi
}

# Option defaults. Each variable is reset explicitly so that a value
# inherited from the environment cannot change the behaviour of the check.
# SSH_CMD belongs to this list as well: get_proc_mdstat switches to the
# custom SSH command as soon as it is non-empty.
HOST=
PORT=
SSH_CMD=
SSH_USER=
SSH_IDENTITY=
TIMEOUT=
SUDO_USER=
SNMP_OID=
SNMP_OPTIONS=()
INCLUDE=^
EXCLUDE=

# Command line parsing. Every option taking a value consumes two words:
# one shift in the case arm, one shared shift at the end of the loop body.
while (( $# > 0 )); do
    case "$1" in
        # ssh, snmp
        -H|--host) nagopt_var HOST "$2"; shift ;;
        -p|--port) nagopt_var PORT "$2"; shift ;;
        -t|--timeout) nagopt_var TIMEOUT "$2"; shift ;;
        # ssh
        -s|--ssh) nagopt_var SSH_CMD "$2"; shift ;;
        -u|--ssh-user) nagopt_var SSH_USER "$2"; shift ;;
        -i|--ssh-identity) nagopt_var SSH_IDENTITY "$2"; shift ;;
        # snmp
        # -P takes every following word up to "--": skip the option plus the
        # collected words here, the shared shift below then drops the "--".
        -P|--snmp-options) nagopt_arglist SNMP_OPTIONS "${@:2}"; shift $((${#SNMP_OPTIONS[@]}+1)) ;;
        # common
        -I|--include) nagopt_var INCLUDE "$2"; shift ;;
        -E|--exclude) nagopt_var EXCLUDE "$2"; shift ;;
        -S|--sudo-user) nagopt_var SUDO_USER "$2"; shift ;;
        -h|--help) nagios_exit_usage ;;
        # anything else is a usage error
        *) nagios_exit_usage ;;
    esac
    shift
done

# Fetch the mdstat text once: REPLY holds the text that is parsed further
# down, ret the exit status of the fetch.
REPLY=$(get_proc_mdstat)
ret=$?

# Any fetch failure ends the check with exit status 3 and an UNKNOWN
# message. Status 255 gets its own message, but only when SSH was the
# transport (custom SSH command, or a host given without an SNMP OID).
if (( ret != 0 )); then
    if (( ret == 255 )) && [[ -n $SSH_CMD || ( -n $HOST && -z $SNMP_OID ) ]]; then
        echo 'UNKNOWN: SSH connect failed'
    else
        echo 'UNKNOWN: Failed to get /proc/mdstat data'
    fi
    exit 3
fi

# Parse the mdstat text held in REPLY and print the plugin status line
# "STATUS: details|perfdata". The awk program uses the three-argument
# match() and PROCINFO["sorted_in"], which are gawk extensions, so "awk"
# has to resolve to gawk. This pipeline is the last command of the script
# as shown here, so the awk exit code (0..3) is the plugin exit code.
# NOTE: the awk program sits inside a single-quoted shell string, so its
# comments must never contain a single quote character.
printf '%s\n' "$REPLY" | awk \
    -v "INCLUDE=$INCLUDE" \
    -v "EXCLUDE=$EXCLUDE" \
'
# Parser states: 0 = waiting for an array header line,
#                1 = header seen, waiting for the [total/in-use] counter,
#                2 = counter seen, waiting for an optional progress line.

# An empty line ends the section of the current array.
$1 == "" { parse = 0; }

# Header line: field 1 is the array name, field 3 its state. Arrays that
# fail the INCLUDE regex or match the EXCLUDE regex are skipped entirely.
parse == 0 && /^md[0-9]+ : / {
    md = $1;
    if ((INCLUDE != "" && !match(md, INCLUDE)) ||
        (EXCLUDE != "" && match(md, EXCLUDE)))
        next;
    all_mds[md] = 1;
    array_state[md] = $3;
    parse = 1;
}

# Counter of the form [a/b] in the last but one field: a is stored as the
# total number of devices, b as the number of devices in use.
parse == 1 && match($(NF-1), /^\[([0-9]+)\/([0-9]+)\]$/, cap) {
    devices_total[md] = cap[1];
    devices_in_use[md] = cap[2];
    parse = 2;
}

# Progress line: "<action> = <percent>% <any> finish=<eta> speed=<rate>".
parse == 2 && match($0, /[ ]+([^ ]+)[ ]+=[ ]+([0-9.]+)% [^ ]+ finish=([^ ]+) speed=([^ ]+)/, cap) {
    action[md] = cap[1];
    percent[md] = cap[2];
    finish[md] = cap[3];
    speed[md] = cap[4];
    parse = 0
}

# Return whichever of the two status codes ranks higher in
# NAGIOS_STATUS_PRIO_TR (OK < UNKNOWN < WARNING < CRITICAL).
function increase_prio(current, target) {
    return (NAGIOS_STATUS_PRIO_TR[target] > NAGIOS_STATUS_PRIO_TR[current]) \
        ? target : current;
}

# Sort weight of an array: missing devices, a running action and a state
# other than "active" each add to the weight.
function get_md_prio(md, _prio) {
    _prio = 0
    if (devices_in_use[md] < devices_total[md])
        _prio = _prio + 10;
    if (action[md] != "")
        _prio = _prio + 10;
    if (array_state[md] != "active")
        _prio = _prio + 8;
    return _prio;
}

# Comparison function for PROCINFO["sorted_in"]: heaviest arrays first,
# arrays of equal weight ordered by name.
function cmp_mds(i1,v1,i2,v2, _p1,_p2) {
    _p1 = get_md_prio(i1);
    _p2 = get_md_prio(i2);
    if (_p1 > _p2) return -1;
    else if (_p1 < _p2) return 1;
    if (i1 < i2) return -1;
    else if (i1 > i2) return 1;
    else return 0
}

END {
    NAGIOS_STATUS_TEXT[0] = "OK";
    NAGIOS_STATUS_TEXT[1] = "WARNING";
    NAGIOS_STATUS_TEXT[2] = "CRITICAL";
    NAGIOS_STATUS_TEXT[3] = "UNKNOWN";
    NAGIOS_STATUS_PRIO_TR[0] = 0;
    NAGIOS_STATUS_PRIO_TR[1] = 2;
    NAGIOS_STATUS_PRIO_TR[2] = 3;
    NAGIOS_STATUS_PRIO_TR[3] = 1;
    status = 0;
    output_err = "";
    output = "";
    perfdata = "";

    PROCINFO["sorted_in"] = "cmp_mds";
    for (md in all_mds) {
        cur_status = 0;
        cur_output = "";
        # Any state other than "active" is CRITICAL.
        if (array_state[md] != "active") {
            cur_output = array_state[md];
            cur_status = increase_prio(cur_status, 2);
        }
        if (devices_in_use[md] != "" && devices_total[md] != "") {
            cur_output = (cur_output == "" ? "" : cur_output " ") \
                devices_in_use[md] "/" devices_total[md] " disks";
            # Fewer devices in use than expected is CRITICAL.
            if (devices_in_use[md] < devices_total[md])
                cur_status = increase_prio(cur_status, 2);
            # Perfdata is emitted only when the counter was parsed: an
            # array without it would otherwise yield "disks.mdX=;;;;",
            # a perfdata token with an empty value.
            perfdata = perfdata (perfdata == "" ? "" : " ") \
                "disks." md "=" devices_in_use[md] ";;;;" devices_total[md];
        }
        # A running action (progress line seen) is at least WARNING.
        if (action[md] != "") {
            cur_status = increase_prio(cur_status, 1);
            cur_output = cur_output " " action[md];
            if (percent[md] != "")
                cur_output = cur_output " " percent[md] "%";
            if (finish[md] != "")
                cur_output = cur_output " (" finish[md] ")";
            if (speed[md] != "")
                cur_output = cur_output " " speed[md];
        }

        # Arrays that are not OK get their name wrapped in "**". The detail
        # text is appended only when there is one, so that an array without
        # details does not leave a dangling space after its name.
        err_flag = (cur_status == 0) ? "" : "**";
        output = output (output == "" ? "" : ", ") \
            err_flag md err_flag (cur_output == "" ? "" : " " cur_output);
        status = increase_prio(status, cur_status);
    }

    # No array matched at all: report UNKNOWN.
    if (output == "") {
        status = increase_prio(status, 3);
        output = "No RAID found!";
    }
    print NAGIOS_STATUS_TEXT[status] ": " output "|" perfdata;
    exit(status);
}
'
