Update Munin plugin example

This commit is contained in:
2023-02-16 16:06:00 -05:00
parent 3bd93563e6
commit eda1b95d5f
2 changed files with 63 additions and 102 deletions

View File

@ -7,23 +7,6 @@
pvc - Plugin to monitor a PVC cluster.
=head1 CONFIGURATION
Note that due to how Munin thresholds work, the warning and critical values must always be slightly less than
the whole numbers they are meant to alert on (1 and 2 respectively), or the alerts will never be triggered.
Defaults (no config required):
[pvc]
env.warning 1.99
env.critical 1.99
Make degraded cluster WARN only (max value is 2, so 3 effectively disables):
[pvc]
env.pvc_cluster_warning 1.99
env.pvc_cluster_critical 3
=head1 AUTHOR
Joshua Boniface <joshua@boniface.me>
@ -45,24 +28,14 @@ GPLv3
# Load the Munin plugin helper library. It is expected to provide the
# print_warning/print_critical helpers called by output_config.
. "$MUNIN_LIBDIR/plugins/plugin.sh"

# Default alert thresholds for the *_alert fields, which take the values
# 0 (OK), 1 (warning) and 2 (critical). Munin only raises an alert when a
# value is strictly outside the configured range, so each threshold must sit
# slightly below the whole number it is meant to trigger on; with 1 and 2 the
# alerts would never fire at the intended states.
warning=0.99
critical=1.99

# Exported so that child processes (presumably the pvc CLI) can see it.
export PVC_CLIENT_DIR="/run/shm/munin-pvc"
# Command (with arguments) that prints the cluster status as JSON.
PVC_CMD="/usr/bin/pvc --quiet --cluster local status --format json-pretty"
# Path to the jq binary used to pick fields out of that JSON.
JQ_CMD="/usr/bin/jq"
# Print a short description of what the plugin reports, then exit 0.
# Outputs: help text on stdout.
output_usage() {
    echo "This plugin outputs numerical values based on the health of the PVC cluster."
    echo
    echo "The health of the PVC cluster and of the local node are each reported as a percentage"
    echo "(pvc_cluster and pvc_node). Each one also has an alerting state (pvc_cluster_alert and"
    echo "pvc_node_alert): 0 when healthy, 1 when the health is at or below 90%, and 2 when the"
    echo "health is at or below 50%. While the cluster is in maintenance mode, the alerting state"
    echo "is always 0."
    exit 0
}
@ -84,72 +57,73 @@ output_autoconf() {
}
# Print the Munin graph and field configuration, then exit 0.
#
# Four fields are declared: pvc_cluster and pvc_node carry a health
# percentage (0-100); pvc_cluster_alert and pvc_node_alert carry an alerting
# state (0-2). Only the alert fields get warning/critical thresholds.
# Each key is emitted exactly once.
#
# Outputs: Munin config lines on stdout.
output_config() {
    echo 'graph_title PVC Health'
    # rrdtool only accepts 1000 or 1024 as the graph base.
    echo 'graph_args --base 1000'
    echo 'graph_vlabel Count'
    echo 'graph_category pvc'
    echo 'graph_period second'
    echo 'graph_info These graphs show the health of the PVC cluster and specific node.'

    echo 'pvc_cluster.label Cluster Health'
    echo 'pvc_cluster.type GAUGE'
    echo 'pvc_cluster.max 100'
    echo 'pvc_cluster.info Health of the PVC cluster in %.'

    echo 'pvc_cluster_alert.label Cluster Health State'
    echo 'pvc_cluster_alert.type GAUGE'
    echo 'pvc_cluster_alert.max 2'
    echo 'pvc_cluster_alert.info Alerting state of the PVC cluster health'
    print_warning pvc_cluster_alert
    print_critical pvc_cluster_alert

    echo 'pvc_node.label Node Health'
    echo 'pvc_node.type GAUGE'
    echo 'pvc_node.max 100'
    echo 'pvc_node.info Health of the PVC node in %.'

    echo 'pvc_node_alert.label Node Health State'
    echo 'pvc_node_alert.type GAUGE'
    echo 'pvc_node_alert.max 2'
    echo 'pvc_node_alert.info Alerting state of the PVC node health'
    print_warning pvc_node_alert
    print_critical pvc_node_alert

    exit 0
}
# Map a health percentage to a Munin alerting state.
# Arguments: $1 - health percentage (integer)
#            $2 - maintenance flag; alerts are only raised when it is "false"
# Outputs:   0 (OK), 1 (health <= 90) or 2 (health <= 50) on stdout
_pvc_health_alert() {
    local health="$1"
    local maintenance="$2"

    if [[ "${maintenance}" != "false" ]]; then
        echo 0
    elif [[ "${health}" -le 50 ]]; then
        echo 2
    elif [[ "${health}" -le 90 ]]; then
        echo 1
    else
        echo 0
    fi
}

# Query the cluster status once and print the current Munin values.
#
# Emits, for the cluster and for the local node (looked up by short
# hostname): the health percentage, the health messages as extinfo, and the
# derived alerting state.
#
# Globals: PVC_CMD, JQ_CMD (read); PVC_OUTPUT, HOST, in_maintenance and the
#          cluster_*/node_* result variables (written, as before).
# Outputs: Munin "<field>.value" / "<field>.extinfo" lines on stdout.
output_values() {
    # PVC_CMD holds a command plus its arguments, so it must be word-split.
    # shellcheck disable=SC2086
    PVC_OUTPUT="$( ${PVC_CMD} )"
    HOST="$( hostname --short )"

    # The same variable name must be used where the flag is stored and where
    # it is tested, otherwise the alert states can never leave 0.
    in_maintenance="$( "${JQ_CMD}" '.maintenance' <<<"${PVC_OUTPUT}" | tr -d '"' )"

    cluster_health="$( "${JQ_CMD}" '.cluster_health.health' <<<"${PVC_OUTPUT}" | tr -d '"' )"
    cluster_health_messages="$( "${JQ_CMD}" -r '.cluster_health.messages | @csv' <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
    cluster_health_alert="$( _pvc_health_alert "${cluster_health}" "${in_maintenance}" )"
    echo "pvc_cluster.value ${cluster_health}"
    echo "pvc_cluster.extinfo ${cluster_health_messages}"
    echo "pvc_cluster_alert.value ${cluster_health_alert}"

    # The hostname is passed as a jq variable instead of being spliced into
    # the filter, so names containing '-' or other non-identifier characters
    # still resolve to the right key.
    node_health="$( "${JQ_CMD}" --arg host "${HOST}" '.node_health[$host].health' <<<"${PVC_OUTPUT}" | tr -d '"' )"
    node_health_messages="$( "${JQ_CMD}" -r --arg host "${HOST}" '.node_health[$host].messages | @csv' <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
    node_health_alert="$( _pvc_health_alert "${node_health}" "${in_maintenance}" )"
    echo "pvc_node.value ${node_health}"
    echo "pvc_node.extinfo ${node_health_messages}"
    echo "pvc_node_alert.value ${node_health_alert}"
}
case $# in