Move monitoring folder to top level

monitoring/README.md (new file, 43 lines)
@@ -0,0 +1,43 @@
# PVC Node Monitoring Resources

This directory contains several monitoring resources that can be used with various monitoring systems to track and alert on a PVC cluster system.

## Prometheus + Grafana

The included example Prometheus configuration and Grafana dashboard can be used to query the PVC API for Prometheus data and display it with a consistent dashboard.

Note that the default configuration here also includes Ceph cluster information; a Ceph dashboard can be found externally.

Note too that this does not include node exporter examples from individual PVC nodes; those must be set up separately.
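As a quick sanity check before pointing Prometheus at a cluster, the PVC metrics endpoint can be queried directly. This is a sketch: the hostname is the placeholder from the example `targets-pvc_cluster.json` later in this commit, and would be replaced with the cluster's real upstream floating address.

```bash
# Placeholder address from the example targets file; substitute your cluster's
# upstream floating address. The port and path match the example prometheus.yml.
curl -s http://pvc.upstream.floating.address.tld:7370/api/v1/metrics | head -n 20
```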
## Munin

The included Munin plugins can be activated by linking to them from `/etc/munin/plugins/`; an activation sketch follows the list. Two plugins are provided:

* `pvc`: Checks the PVC cluster and node health, as well as their status (OK/Warning/Critical, based on maintenance status), providing 4 graphs.

* `ceph_utilization`: Checks the Ceph cluster statistics, providing multiple graphs. Note that this plugin is independent of PVC itself, and makes local calls to various Ceph commands directly.
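A minimal activation sketch, assuming this repository has been checked out to `/usr/share/pvc/monitoring` (the checkout path is an assumption; adjust it to wherever these plugin files actually live):

```bash
# Assumed checkout location of this repository; adjust as needed.
PLUGIN_SRC="/usr/share/pvc/monitoring/munin"

# Activate both plugins by linking them into the Munin plugin directory.
ln -s "${PLUGIN_SRC}/pvc" /etc/munin/plugins/pvc
ln -s "${PLUGIN_SRC}/ceph_utilization" /etc/munin/plugins/ceph_utilization

# Restart munin-node so the new plugins are picked up.
systemctl restart munin-node
```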
The `pvc` plugin provides no configuration; the status is hardcoded such that <=90% health is warning, <=50% health is critical, and maintenance state forces OK. The alerting is provided by two graphs separate from the health graphs, so that the actual health state is logged regardless of the alerting state.

The `ceph_utilization` plugin provides no configuration; only the cluster utilization graph alerts, such that >80% used is warning and >90% used is critical. Ceph itself begins warning above 80% as well.
## CheckMK

The included CheckMK plugin is divided into two parts: the agent plugin, and the monitoring server plugin. The monitoring server plugin requires CheckMK version 2.0 or higher. The two parts can be installed as follows (a command sketch is given after the list):

* `pvc`: Place this file in the `/usr/lib/check_mk_agent/plugins/` directory on each node.

* `pvc.py`: Place this file in the `~/local/lib/python3/cmk/base/plugins/agent_based/` directory on the CheckMK monitoring host for each monitoring site.
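A minimal installation sketch, assuming the repository is checked out at `/usr/share/pvc/monitoring` and the CheckMK site is named `monitoring` (both the checkout path and the site name are assumptions; adjust them for your environment):

```bash
# On each PVC node (agent side); the source path is an assumption.
install -m 0755 /usr/share/pvc/monitoring/checkmk/pvc \
    /usr/lib/check_mk_agent/plugins/pvc

# On the CheckMK monitoring host, as the site user (site name is an assumption).
su - monitoring -c 'cp /usr/share/pvc/monitoring/checkmk/pvc.py \
    ~/local/lib/python3/cmk/base/plugins/agent_based/pvc.py'
```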
The plugin provides no configuration: the status is hardcoded such that <=90% health is warning, <=50% health is critical, and maintenance state forces OK.

With both the agent and server plugins installed, you can then run `cmk -II <node>` (or use WATO) to inventory each node, which should produce two new checks:

* `PVC Cluster`: Provides the cluster-wide health. Note that this will be identical for all nodes in the cluster (i.e. if the cluster health drops, all nodes in the cluster will alert this check).

* `PVC Node <shortname>`: Provides the per-node health.

The "Summary" text, shown in the check lists, will be simplistic, only showing the current health percentage.

The "Details" text, found in the specific check details, will show the full list of problem(s) the check finds, as shown by `pvc status` itself.
monitoring/checkmk/pvc (new executable file, 6 lines)
@@ -0,0 +1,6 @@
#!/bin/bash

# PVC cluster status check for Check_MK (agent-side)

echo "<<<pvc>>>"
pvc --quiet status --format json
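The agent plugin above can be tested locally on a PVC node once installed; this is a sketch assuming the install path given in the README:

```bash
# Run the agent plugin directly; it should print the "<<<pvc>>>" section header
# followed by one JSON document from "pvc --quiet status --format json".
/usr/lib/check_mk_agent/plugins/pvc | head -c 200
```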
monitoring/checkmk/pvc.py (new file, 95 lines)
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
#
# Check_MK PVC plugin
#
# Copyright 2017-2021, Joshua Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from .agent_based_api.v1 import *
from cmk.base.check_api import host_name
from time import time
from json import loads


def discover_pvc(section):
    my_node = host_name().split(".")[0]
    yield Service(item=f"PVC Node {my_node}")
    yield Service(item="PVC Cluster")


def check_pvc(item, params, section):
    state = State.OK
    summary = "Stuff"
    details = None
    data = loads(" ".join(section[0]))
    my_node = host_name().split(".")[0]

    maintenance_map = {
        "true": "on",
        "false": "off",
    }
    maintenance = maintenance_map[data["maintenance"]]

    # Node check
    if item == f"PVC Node {my_node}":
        my_node = host_name().split(".")[0]
        node_health = data["node_health"][my_node]["health"]
        node_messages = data["node_health"][my_node]["messages"]

        summary = f"Node health is {node_health}% (maintenance {maintenance})"

        if len(node_messages) > 0:
            details = ", ".join(node_messages)

        if node_health <= 50 and maintenance == "off":
            state = State.CRIT
        elif node_health <= 90 and maintenance == "off":
            state = State.WARN
        else:
            state = State.OK

        yield Metric(name="node-health", value=node_health)

    # Cluster check
    elif item == "PVC Cluster":
        cluster_health = data["cluster_health"]["health"]
        cluster_messages = data["cluster_health"]["messages"]

        summary = f"Cluster health is {cluster_health}% (maintenance {maintenance})"

        if len(cluster_messages) > 0:
            details = ", ".join([m["text"] for m in cluster_messages])

        if cluster_health <= 50 and maintenance == "off":
            state = State.CRIT
        elif cluster_health <= 90 and maintenance == "off":
            state = State.WARN
        else:
            state = State.OK

        yield Metric(name="cluster-health", value=cluster_health)

    yield Result(state=state, summary=summary, details=details)
    return


register.check_plugin(
    name="pvc",
    service_name="%s",
    check_ruleset_name="pvc",
    discovery_function=discover_pvc,
    check_function=check_pvc,
    check_default_parameters={},
)
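Once both parts are in place, the new checks can be verified from the CheckMK site; this is a sketch with `<node>` as a placeholder for a monitored PVC node, and it assumes the standard `cmk` command-line workflow on the monitoring host:

```bash
# Confirm the agent section actually arrives from the node.
cmk -d <node> | grep -A1 '<<<pvc>>>'

# Re-inventory the node so the "PVC Cluster" and "PVC Node <shortname>"
# services appear, then reload the monitoring core.
cmk -II <node>
cmk -O
```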
monitoring/munin/ceph_utilization (new executable file, 325 lines)
@@ -0,0 +1,325 @@
#!/bin/bash
# -*- sh -*-

: << =cut

=head1 NAME

ceph_utilization - Plugin to monitor a Ceph cluster's utilization

=head1 CONFIGURATION

Defaults (no config required) for the total utilization thresholds:

  [ceph_utilization]
  env.warning 80
  env.critical 90

=head1 AUTHOR

Joshua Boniface <joshua@boniface.me>

=head1 LICENSE

GPLv3

=head1 BUGS

=back

=head1 MAGIC MARKERS

#%# family=auto
#%# capabilities=autoconf

=cut

. "$MUNIN_LIBDIR/plugins/plugin.sh"

is_multigraph

warning=80
critical=90

RADOSDF_CMD="/usr/bin/sudo /usr/bin/rados df --format json"
OSDDF_CMD="/usr/bin/sudo /usr/bin/ceph osd df --format json"
JQ_CMD="/usr/bin/jq"

output_usage() {
    echo "This plugin outputs information about a Ceph cluster"
    exit 0
}

output_autoconf() {
    $RADOSDF_CMD &>/dev/null
    radosdf_ret=$?
    $OSDDF_CMD &>/dev/null
    osddf_ret=$?
    $JQ_CMD --version &>/dev/null
    jq_ret=$?

    if [[ ${radosdf_ret} -eq 0 && ${osddf_ret} -eq 0 && ${jq_ret} -eq 0 ]]; then
        echo "yes"
    elif [[ ${radosdf_ret} -ne 0 || ${osddf_ret} -ne 0 ]]; then
        echo "no (no 'rados' or 'ceph' command found)"
    elif [[ ${jq_ret} -ne 0 ]]; then
        echo "no (no 'jq' command found)"
    else
        echo "no (general failure)"
    fi
}

output_config() {
    # Graph set 1 - Ceph cluster utilization
    echo 'multigraph cluster_utilization'
    echo 'graph_title Cluster Utilization'
    echo 'graph_args --base 1000'
    echo 'graph_vlabel % Utilization'
    echo 'graph_category ceph'
    echo 'graph_info This graph shows the cluster utilization.'

    echo 'cluster_utilization.label Cluster Utilization'
    echo 'cluster_utilization.type GAUGE'
    echo 'cluster_utilization.max 100'
    echo 'cluster_utilization.info Percentage utilization of the cluster.'
    print_warning cluster_utilization
    print_critical cluster_utilization

    # Graph set 2 - Ceph cluster objects
    echo 'multigraph cluster_objects'
    echo 'graph_title Cluster Objects'
    echo 'graph_args --base 1000'
    echo 'graph_vlabel Objects'
    echo 'graph_category ceph'
    echo 'graph_info This graph shows the cluster object count.'

    echo 'cluster_objects.label Cluster Objects'
    echo 'cluster_objects.type GAUGE'
    echo 'cluster_objects.min 0'
    echo 'cluster_objects.info Total objects in the cluster.'

    POOL_LIST="$( $RADOSDF_CMD | jq -r '.pools[].name' )"

    # Graph set 3 - Cluster I/O Bytes Lifetime
    echo 'multigraph pool_rdbytes'
    echo "graph_title IO Bytes (Lifetime)"
    echo "graph_args --base 1000"
    echo "graph_vlabel bytes read (-) / write (+)"
    echo "graph_category ceph"
    echo "graph_info This graph shows the lifetime cluster bytes."
    for pool in ${POOL_LIST}; do
        # Graph set 3 - Cluster I/O Bytes Lifetime
        echo "pool_rdbytes_${pool}.label Pool ${pool} IO (Bytes)"
        echo "pool_rdbytes_${pool}.type GAUGE"
        echo "pool_rdbytes_${pool}.min 0"
        echo "pool_rdbytes_${pool}.draw LINE1"
        echo "pool_rdbytes_${pool}.graph no"
        echo "pool_wrbytes_${pool}.label Pool ${pool} IO (Bytes)"
        echo "pool_wrbytes_${pool}.type GAUGE"
        echo "pool_wrbytes_${pool}.min 0"
        echo "pool_wrbytes_${pool}.draw LINE1"
        echo "pool_wrbytes_${pool}.negative pool_rdbytes_${pool}"
    done

    # Graph set 4 - Cluster I/O Operations Lifetime
    echo 'multigraph pool_rdops'
    echo "graph_title IO Operations (Lifetime)"
    echo "graph_args --base 1000"
    echo "graph_vlabel IOs read (-) / write (+)"
    echo "graph_category ceph"
    echo "graph_info This graph shows the lifetime cluster IOs."
    for pool in ${POOL_LIST}; do
        # Graph set 4 - Cluster I/O Operations Lifetime
        echo "pool_rdops_${pool}.label Pool ${pool} IO (Ops)"
        echo "pool_rdops_${pool}.type GAUGE"
        echo "pool_rdops_${pool}.min 0"
        echo "pool_rdops_${pool}.draw LINE1"
        echo "pool_rdops_${pool}.graph no"
        echo "pool_wrops_${pool}.label Pool ${pool} IO (Ops)"
        echo "pool_wrops_${pool}.type GAUGE"
        echo "pool_wrops_${pool}.min 0"
        echo "pool_wrops_${pool}.draw LINE1"
        echo "pool_wrops_${pool}.negative pool_rdops_${pool}"
    done

    # Graph set 5 - Ceph pool objects
    echo 'multigraph pool_objects_total'
    echo "graph_title Objects"
    echo "graph_args --base 1000"
    echo "graph_vlabel Objects"
    echo "graph_category ceph"
    echo "graph_info This graph shows the cluster object count."
    for pool in ${POOL_LIST}; do
        # Graph set 5 - Ceph pool objects
        echo "pool_objects_total_${pool}.label Pool ${pool} Objects"
        echo "pool_objects_total_${pool}.type GAUGE"
        echo "pool_objects_total_${pool}.min 0"
        echo "pool_objects_total_${pool}.info Total objects in the pool."
    done

    # Graph set 6 - Ceph pool objects copies
    echo 'multigraph pool_objects_copies'
    echo "graph_title Objects Copies"
    echo "graph_args --base 1000"
    echo "graph_vlabel Objects"
    echo "graph_category ceph"
    echo "graph_info This graph shows the cluster object copy count."
    for pool in ${POOL_LIST}; do
        # Graph set 6 - Ceph pool objects copies
        echo "pool_objects_copies_${pool}.label Pool ${pool} Objects Copies"
        echo "pool_objects_copies_${pool}.type GAUGE"
        echo "pool_objects_copies_${pool}.min 0"
        echo "pool_objects_copies_${pool}.info Total object copies in the pool."
    done

    # Graph set 7 - Ceph pool objects degraded
    echo 'multigraph pool_objects_degraded'
    echo "graph_title Objects Degraded"
    echo "graph_args --base 1000"
    echo "graph_vlabel Objects"
    echo "graph_category ceph"
    echo "graph_info This graph shows the cluster object degraded count."
    for pool in ${POOL_LIST}; do
        # Graph set 7 - Ceph pool objects degraded
        echo "pool_objects_degraded_${pool}.label Pool ${pool} Objects Degraded"
        echo "pool_objects_degraded_${pool}.type GAUGE"
        echo "pool_objects_degraded_${pool}.min 0"
        echo "pool_objects_degraded_${pool}.info Total degraded objects in the pool."
    done

    OSD_LIST="$( $OSDDF_CMD | jq -r '.nodes[].id' | sort -n )"

    # Graph set 8 - Ceph OSD status
    echo 'multigraph osd_status'
    echo "graph_title OSD Status"
    echo "graph_args --base 1000"
    echo "graph_vlabel Status Up (1) / Down (0)"
    echo "graph_category ceph"
    echo "graph_info This graph shows the OSD status."
    for osd in ${OSD_LIST}; do
        # Graph set 8 - Ceph OSD status
        echo "osd_status_${osd}.label osd.${osd} Status"
        echo "osd_status_${osd}.type GAUGE"
        echo "osd_status_${osd}.min 0"
        echo "osd_status_${osd}.max 1"
        echo "osd_status_${osd}.info Status of the OSD."
    done

    # Graph set 9 - Ceph OSD utilization
    echo 'multigraph osd_utilization'
    echo "graph_title OSD Utilization"
    echo "graph_args --base 1000"
    echo "graph_vlabel % Utilization"
    echo "graph_category ceph"
    echo "graph_info This graph shows the OSD utilization."
    for osd in ${OSD_LIST}; do
        # Graph set 9 - Ceph OSD utilization
        echo "osd_utilization_${osd}.label osd.${osd} Utilization"
        echo "osd_utilization_${osd}.type GAUGE"
        echo "osd_utilization_${osd}.max 100"
        echo "osd_utilization_${osd}.info Utilization of the OSD."
    done

    exit 0
}

output_values() {
    RADOS_JSON_OUTPUT="$( $RADOSDF_CMD )"
    OSD_JSON_OUTPUT="$( $OSDDF_CMD )"

    cluster_utilization="$( $JQ_CMD -r '.total_used' <<<"${RADOS_JSON_OUTPUT}" )"
    cluster_size="$( $JQ_CMD -r '.total_space' <<<"${RADOS_JSON_OUTPUT}" )"
    pct_utilization="$( echo "scale=4; ${cluster_utilization} / ${cluster_size} * 100" | bc -l )"
    cluster_objects="$( $JQ_CMD -r '.total_objects' <<<"${RADOS_JSON_OUTPUT}" )"

    echo "multigraph cluster_utilization"
    echo "cluster_utilization.value ${pct_utilization}"
    echo "multigraph cluster_objects"
    echo "cluster_objects.value ${cluster_objects}"

    cluster_pool_count="$( $JQ_CMD -r '.pools[].name' <<<"${RADOS_JSON_OUTPUT}" | wc -l )"
    echo "multigraph pool_rdbytes"
    for id in $( seq 0 $(( ${cluster_pool_count} - 1 )) ); do
        pool="$( $JQ_CMD -r ".pools[$id].name" <<<"${RADOS_JSON_OUTPUT}" )"
        pool_rdbytes="$( $JQ_CMD -r ".pools[$id].read_bytes" <<<"${RADOS_JSON_OUTPUT}" )"
        pool_wrbytes="$( $JQ_CMD -r ".pools[$id].write_bytes" <<<"${RADOS_JSON_OUTPUT}" )"
        echo "pool_rdbytes_${pool}.value ${pool_rdbytes}"
        echo "pool_wrbytes_${pool}.value ${pool_wrbytes}"
    done

    echo "multigraph pool_rdops"
    for id in $( seq 0 $(( ${cluster_pool_count} - 1 )) ); do
        pool="$( $JQ_CMD -r ".pools[$id].name" <<<"${RADOS_JSON_OUTPUT}" )"
        pool_rdops="$( $JQ_CMD -r ".pools[$id].read_ops" <<<"${RADOS_JSON_OUTPUT}" )"
        pool_wrops="$( $JQ_CMD -r ".pools[$id].write_ops" <<<"${RADOS_JSON_OUTPUT}" )"
        echo "pool_rdops_${pool}.value ${pool_rdops}"
        echo "pool_wrops_${pool}.value ${pool_wrops}"
    done

    echo "multigraph pool_objects_total"
    for id in $( seq 0 $(( ${cluster_pool_count} - 1 )) ); do
        pool="$( $JQ_CMD -r ".pools[$id].name" <<<"${RADOS_JSON_OUTPUT}" )"
        pool_objects="$( $JQ_CMD -r ".pools[$id].num_objects" <<<"${RADOS_JSON_OUTPUT}" )"
        echo "pool_objects_total_${pool}.value ${pool_objects}"
    done

    echo "multigraph pool_objects_copies"
    for id in $( seq 0 $(( ${cluster_pool_count} - 1 )) ); do
        pool="$( $JQ_CMD -r ".pools[$id].name" <<<"${RADOS_JSON_OUTPUT}" )"
        pool_copies="$( $JQ_CMD -r ".pools[$id].num_object_copies" <<<"${RADOS_JSON_OUTPUT}" )"
        echo "pool_objects_copies_${pool}.value ${pool_copies}"
    done

    echo "multigraph pool_objects_degraded"
    for id in $( seq 0 $(( ${cluster_pool_count} - 1 )) ); do
        pool="$( $JQ_CMD -r ".pools[$id].name" <<<"${RADOS_JSON_OUTPUT}" )"
        pool_degraded="$( $JQ_CMD -r ".pools[$id].num_objects_degraded" <<<"${RADOS_JSON_OUTPUT}" )"
        echo "pool_objects_degraded_${pool}.value ${pool_degraded}"
    done

    cluster_osd_count="$( $JQ_CMD -r '.nodes[].id' <<<"${OSD_JSON_OUTPUT}" | wc -l )"
    echo "multigraph osd_status"
    for id in $( seq 0 $(( ${cluster_osd_count} - 1 )) ); do
        osd="$( $JQ_CMD -r ".nodes[$id].id" <<<"${OSD_JSON_OUTPUT}" )"
        osd_status="$( $JQ_CMD -r ".nodes[$id].status" <<<"${OSD_JSON_OUTPUT}" )"
        case ${osd_status} in
            up)
                osd_status="1"
                ;;
            *)
                osd_status="0"
                ;;
        esac
        echo "osd_status_${osd}.value ${osd_status}"
    done

    echo "multigraph osd_utilization"
    for id in $( seq 0 $(( ${cluster_osd_count} - 1 )) ); do
        osd="$( $JQ_CMD -r ".nodes[$id].id" <<<"${OSD_JSON_OUTPUT}" )"
        osd_utilization="$( $JQ_CMD -r ".nodes[$id].utilization" <<<"${OSD_JSON_OUTPUT}" )"
        echo "osd_utilization_${osd}.value ${osd_utilization}"
    done
}

case $# in
    0)
        output_values
        ;;
    1)
        case $1 in
            autoconf)
                output_autoconf
                ;;
            config)
                output_config
                ;;
            *)
                output_usage
                exit 1
                ;;
        esac
        ;;
    *)
        output_usage
        exit 1
esac
monitoring/munin/pvc (new executable file, 182 lines)
@@ -0,0 +1,182 @@
#!/bin/bash
# -*- sh -*-

: << =cut

=head1 NAME

pvc - Plugin to monitor a PVC cluster.

=head1 AUTHOR

Joshua Boniface <joshua@boniface.me>

=head1 LICENSE

GPLv3

=head1 BUGS

=back

=head1 MAGIC MARKERS

#%# family=auto
#%# capabilities=autoconf

=cut

. "$MUNIN_LIBDIR/plugins/plugin.sh"

is_multigraph

warning=0.99
critical=1.99

export PVC_CLIENT_DIR="/run/shm/munin-pvc"
PVC_CMD="/usr/bin/pvc --quiet --cluster local status --format json-pretty"
JQ_CMD="/usr/bin/jq"

output_usage() {
    echo "This plugin outputs information about a PVC cluster and node"
    exit 0
}

output_autoconf() {
    $PVC_CMD &>/dev/null
    pvc_ret=$?
    $JQ_CMD --version &>/dev/null
    jq_ret=$?

    if [[ ${pvc_ret} -eq 0 && ${jq_ret} -eq 0 ]]; then
        echo "yes"
    elif [[ ${pvc_ret} -ne 0 ]]; then
        echo "no (no 'pvc' command found or local cluster not usable)"
    elif [[ ${jq_ret} -ne 0 ]]; then
        echo "no (no 'jq' command found)"
    else
        echo "no (generic failure)"
    fi
}

output_config() {
    echo 'multigraph pvc_cluster_health'
    echo 'graph_title PVC Cluster Health'
    echo 'graph_args --base 1000'
    echo 'graph_vlabel Health%'
    echo 'graph_category pvc'
    echo 'graph_info Health of the PVC cluster'

    echo 'pvc_cluster_health.label Cluster Health'
    echo 'pvc_cluster_health.type GAUGE'
    echo 'pvc_cluster_health.max 100'
    echo 'pvc_cluster_health.min 0'
    echo 'pvc_cluster_health.info Health of the PVC cluster in %'

    echo 'multigraph pvc_cluster_alert'
    echo 'graph_title PVC Cluster Alerting'
    echo 'graph_args --base 1000'
    echo 'graph_vlabel State'
    echo 'graph_category pvc'
    echo 'graph_info Alerting state of the PVC cluster health'

    echo 'pvc_cluster_alert.label Cluster Health State'
    echo 'pvc_cluster_alert.type GAUGE'
    echo 'pvc_cluster_alert.max 2'
    echo 'pvc_cluster_alert.min 0'
    echo 'pvc_cluster_alert.info Alerting state of the PVC cluster health'
    print_warning pvc_cluster_alert
    print_critical pvc_cluster_alert

    echo 'multigraph pvc_node_health'
    echo 'graph_title PVC Node Health'
    echo 'graph_args --base 1000'
    echo 'graph_vlabel Health%'
    echo 'graph_category pvc'
    echo 'graph_info Health of the PVC node'

    echo 'pvc_node_health.label Node Health'
    echo 'pvc_node_health.type GAUGE'
    echo 'pvc_node_health.max 100'
    echo 'pvc_node_health.min 0'
    echo 'pvc_node_health.info Health of the PVC node in %'

    echo 'multigraph pvc_node_alert'
    echo 'graph_title PVC Node Alerting'
    echo 'graph_args --base 1000'
    echo 'graph_vlabel State'
    echo 'graph_category pvc'
    echo 'graph_info Alerting state of the PVC node health'

    echo 'pvc_node_alert.label Node Health State'
    echo 'pvc_node_alert.type GAUGE'
    echo 'pvc_node_alert.max 2'
    echo 'pvc_node_alert.min 0'
    echo 'pvc_node_alert.info Alerting state of the PVC node health'
    print_warning pvc_node_alert
    print_critical pvc_node_alert

    exit 0
}

output_values() {
    PVC_OUTPUT="$( $PVC_CMD )"
    HOST="$( hostname --short )"

    is_maintenance="$( $JQ_CMD ".maintenance" <<<"${PVC_OUTPUT}" | tr -d '"' )"

    cluster_health="$( $JQ_CMD ".cluster_health.health" <<<"${PVC_OUTPUT}" | tr -d '"' )"
    cluster_health_messages="$( $JQ_CMD -r ".cluster_health.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
    echo 'multigraph pvc_cluster_health'
    echo "pvc_cluster_health.value ${cluster_health}"
    echo "pvc_cluster_health.extinfo ${cluster_health_messages}"

    if [[ ${cluster_health} -le 50 && ${is_maintenance} == "false" ]]; then
        cluster_health_alert=2
    elif [[ ${cluster_health} -le 90 && ${is_maintenance} == "false" ]]; then
        cluster_health_alert=1
    else
        cluster_health_alert=0
    fi
    echo 'multigraph pvc_cluster_alert'
    echo "pvc_cluster_alert.value ${cluster_health_alert}"

    node_health="$( $JQ_CMD ".node_health.${HOST}.health" <<<"${PVC_OUTPUT}" | tr -d '"' )"
    node_health_messages="$( $JQ_CMD -r ".node_health.${HOST}.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
    echo 'multigraph pvc_node_health'
    echo "pvc_node_health.value ${node_health}"
    echo "pvc_node_health.extinfo ${node_health_messages}"

    if [[ ${node_health} -le 50 && ${is_maintenance} != "true" ]]; then
        node_health_alert=2
    elif [[ ${node_health} -le 90 && ${is_maintenance} != "true" ]]; then
        node_health_alert=1
    else
        node_health_alert=0
    fi
    echo 'multigraph pvc_node_alert'
    echo "pvc_node_alert.value ${node_health_alert}"
}

case $# in
    0)
        output_values
        ;;
    1)
        case $1 in
            autoconf)
                output_autoconf
                ;;
            config)
                output_config
                ;;
            *)
                output_usage
                exit 1
                ;;
        esac
        ;;
    *)
        output_usage
        exit 1
esac
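Both Munin plugins above can be exercised by hand with `munin-run` once linked into `/etc/munin/plugins/`, which is a convenient way to confirm the graph definitions and values before the next scheduled poll:

```bash
# Print the graph definitions, then the current values, for each plugin.
munin-run pvc config
munin-run pvc
munin-run ceph_utilization config
munin-run ceph_utilization
```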
monitoring/prometheus/grafana-pvc-health-dashboard.json (new file, 2600 lines)
(File diff suppressed because it is too large.)
monitoring/prometheus/prometheus.yml (new file, 8 lines)
@@ -0,0 +1,8 @@
# Other configuration omitted
scrape_configs:
  - job_name: "pvc_cluster"
    metrics_path: /api/v1/metrics
    scheme: "http"
    file_sd_configs:
      - files:
        - 'targets-pvc_cluster.json'
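If `promtool` from the Prometheus distribution is available, the merged configuration can be validated before reloading Prometheus; the config path below is an assumption for a standard Debian layout, with the snippet above merged into the main `prometheus.yml` alongside `targets-pvc_cluster.json`:

```bash
# Validate the Prometheus configuration after merging in the scrape job above.
promtool check config /etc/prometheus/prometheus.yml
```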
monitoring/prometheus/targets-pvc_cluster.json (new file, 11 lines)
@@ -0,0 +1,11 @@
[
  {
    "targets": [
      "pvc.upstream.floating.address.tld:7370"
    ],
    "labels": {
      "cluster": "cluster1"
    }
  }
]