Compare commits

...

8 Commits

9 changed files with 260 additions and 131 deletions

View File

@@ -6,7 +6,7 @@
ignore = W503, E501 ignore = W503, E501
extend-ignore = E203 extend-ignore = E203
# We exclude the Debian, migrations, and provisioner examples # We exclude the Debian, migrations, and provisioner examples
exclude = debian,api-daemon/migrations/versions,api-daemon/provisioner/examples exclude = debian,api-daemon/migrations/versions,api-daemon/provisioner/examples,node-daemon/monitoring
# Set the max line length to 88 for Black # Set the max line length to 88 for Black
max-line-length = 88 max-line-length = 88

View File

@@ -125,11 +125,11 @@ def format_info(cluster_information, oformat):
return json.dumps(cluster_information, indent=4) return json.dumps(cluster_information, indent=4)
# Plain formatting, i.e. human-readable # Plain formatting, i.e. human-readable
if cluster_information["maintenance"] == "True": if cluster_information["maintenance"] == "true":
health_colour = ansiprint.blue() health_colour = ansiprint.blue()
elif cluster_information["health"] > 90: elif cluster_information["cluster_health"]["health"] > 90:
health_colour = ansiprint.green() health_colour = ansiprint.green()
elif cluster_information["health"] > 50: elif cluster_information["cluster_health"]["health"] > 50:
health_colour = ansiprint.yellow() health_colour = ansiprint.yellow()
else: else:
health_colour = ansiprint.red() health_colour = ansiprint.red()
@@ -141,8 +141,8 @@ def format_info(cluster_information, oformat):
) )
ainformation.append("") ainformation.append("")
health_text = f"{cluster_information['health']}%" health_text = f"{cluster_information['cluster_health']['health']}%"
if cluster_information["maintenance"] == "True": if cluster_information["maintenance"] == "true":
health_text += " (maintenance on)" health_text += " (maintenance on)"
ainformation.append( ainformation.append(
@@ -154,9 +154,9 @@ def format_info(cluster_information, oformat):
ansiprint.end(), ansiprint.end(),
) )
) )
if cluster_information["health_messages"]: if cluster_information["cluster_health"]["messages"]:
health_messages = "\n > ".join( health_messages = "\n > ".join(
sorted(cluster_information["health_messages"]) sorted(cluster_information["cluster_health"]["messages"])
) )
ainformation.append( ainformation.append(
"{}Health messages:{} > {}".format( "{}Health messages:{} > {}".format(

View File

@@ -697,15 +697,29 @@ def node_log(node, lines, follow):
default=False, default=False,
help="Display more detailed information.", help="Display more detailed information.",
) )
@click.option(
"-f",
"--format",
"oformat",
default="plain",
show_default=True,
type=click.Choice(["plain", "json", "json-pretty"]),
help="Output format of node status information.",
)
@cluster_req @cluster_req
def node_info(node, long_output): def node_info(node, long_output, oformat):
""" """
Show information about node NODE. If unspecified, defaults to this host. Show information about node NODE. If unspecified, defaults to this host.
""" """
retcode, retdata = pvc_node.node_info(config, node) retcode, retdata = pvc_node.node_info(config, node)
if retcode: if retcode:
retdata = pvc_node.format_info(retdata, long_output) if oformat == "json":
retdata = json.dumps(retdata)
elif oformat == "json-pretty":
retdata = json.dumps(retdata, indent=4)
else:
retdata = pvc_node.format_info(retdata, long_output)
cleanup(retcode, retdata) cleanup(retcode, retdata)

View File

@@ -57,8 +57,8 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
} }
# Generate total cluster health numbers # Generate total cluster health numbers
cluster_health = 100 cluster_health_value = 100
messages = list() cluster_health_messages = list()
for index, node in enumerate(node_list): for index, node in enumerate(node_list):
# Apply node health values to total health number # Apply node health values to total health number
@@ -66,31 +66,33 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
node_health_int = int(node["health"]) node_health_int = int(node["health"])
except Exception: except Exception:
node_health_int = 100 node_health_int = 100
cluster_health -= 100 - node_health_int cluster_health_value -= 100 - node_health_int
for entry in node["health_details"]: for entry in node["health_details"]:
if entry["health_delta"] > 0: if entry["health_delta"] > 0:
messages.append( cluster_health_messages.append(
f"{node['name']}: plugin '{entry['name']}': {entry['message']}" f"{node['name']}: plugin '{entry['name']}': {entry['message']}"
) )
# Handle unhealthy node states # Handle unhealthy node states
if node["daemon_state"] not in ["run"]: if node["daemon_state"] not in ["run"]:
cluster_health -= health_delta_map["node_stopped"] cluster_health_value -= health_delta_map["node_stopped"]
messages.append( cluster_health_messages.append(
f"cluster: Node {node['name']} in {node['daemon_state'].upper()} daemon state" f"cluster: Node {node['name']} in {node['daemon_state'].upper()} daemon state"
) )
elif node["domain_state"] not in ["ready"]: elif node["domain_state"] not in ["ready"]:
cluster_health -= health_delta_map["node_flushed"] cluster_health_value -= health_delta_map["node_flushed"]
messages.append( cluster_health_messages.append(
f"cluster: Node {node['name']} in {node['domain_state'].upper()} domain state" f"cluster: Node {node['name']} in {node['domain_state'].upper()} domain state"
) )
for index, vm in enumerate(vm_list): for index, vm in enumerate(vm_list):
# Handle unhealthy VM states # Handle unhealthy VM states
if vm["state"] not in ["start", "disable", "migrate", "unmigrate", "provision"]: if vm["state"] not in ["start", "disable", "migrate", "unmigrate", "provision"]:
cluster_health -= health_delta_map["vm_stopped"] cluster_health_value -= health_delta_map["vm_stopped"]
messages.append(f"cluster: VM {vm['name']} in {vm['state'].upper()} state") cluster_health_messages.append(
f"cluster: VM {vm['name']} in {vm['state'].upper()} state"
)
for index, ceph_osd in enumerate(ceph_osd_list): for index, ceph_osd in enumerate(ceph_osd_list):
in_texts = {1: "in", 0: "out"} in_texts = {1: "in", 0: "out"}
@@ -98,13 +100,13 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
# Handle unhealthy OSD states # Handle unhealthy OSD states
if in_texts[ceph_osd["stats"]["in"]] not in ["in"]: if in_texts[ceph_osd["stats"]["in"]] not in ["in"]:
cluster_health -= health_delta_map["osd_out"] cluster_health_value -= health_delta_map["osd_out"]
messages.append( cluster_health_messages.append(
f"cluster: Ceph OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']].upper()} state" f"cluster: Ceph OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']].upper()} state"
) )
elif up_texts[ceph_osd["stats"]["up"]] not in ["up"]: elif up_texts[ceph_osd["stats"]["up"]] not in ["up"]:
cluster_health -= health_delta_map["osd_down"] cluster_health_value -= health_delta_map["osd_down"]
messages.append( cluster_health_messages.append(
f"cluster: Ceph OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']].upper()} state" f"cluster: Ceph OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']].upper()} state"
) )
@@ -131,8 +133,8 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
for index, node in enumerate(n_minus_1_node_list): for index, node in enumerate(n_minus_1_node_list):
n_minus_1_total += node["memory"]["total"] n_minus_1_total += node["memory"]["total"]
if alloc_total > n_minus_1_total: if alloc_total > n_minus_1_total:
cluster_health -= health_delta_map["memory_overprovisioned"] cluster_health_value -= health_delta_map["memory_overprovisioned"]
messages.append( cluster_health_messages.append(
f"cluster: Total memory is OVERPROVISIONED ({alloc_total} > {n_minus_1_total} @ N-1)" f"cluster: Total memory is OVERPROVISIONED ({alloc_total} > {n_minus_1_total} @ N-1)"
) )
@@ -146,16 +148,43 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
"HEALTH_WARN": "WARNING", "HEALTH_WARN": "WARNING",
} }
for entry in ceph_health_entries: for entry in ceph_health_entries:
messages.append( cluster_health_messages.append(
f"cluster: Ceph {ceph_health_status_map[ceph_health['checks'][entry]['severity']]} {entry}: {ceph_health['checks'][entry]['summary']['message']}" f"cluster: Ceph {ceph_health_status_map[ceph_health['checks'][entry]['severity']]} {entry}: {ceph_health['checks'][entry]['summary']['message']}"
) )
if ceph_health_status == "HEALTH_ERR": if ceph_health_status == "HEALTH_ERR":
cluster_health -= health_delta_map["ceph_err"] cluster_health_value -= health_delta_map["ceph_err"]
elif ceph_health_status == "HEALTH_WARN": elif ceph_health_status == "HEALTH_WARN":
cluster_health -= health_delta_map["ceph_warn"] cluster_health_value -= health_delta_map["ceph_warn"]
return cluster_health, messages if cluster_health_value < 0:
cluster_health_value = 0
cluster_health = {
"health": cluster_health_value,
"messages": cluster_health_messages,
}
return cluster_health
def getNodeHealth(zkhandler, node_list):
node_health = dict()
for index, node in enumerate(node_list):
node_health_messages = list()
node_health_value = node["health"]
for entry in node["health_details"]:
if entry["health_delta"] > 0:
node_health_messages.append(f"'{entry['name']}': {entry['message']}")
node_health_entry = {
"health": node_health_value,
"messages": node_health_messages,
}
node_health[node["name"]] = node_health_entry
return node_health
def getClusterInformation(zkhandler): def getClusterInformation(zkhandler):
@@ -259,15 +288,12 @@ def getClusterInformation(zkhandler):
if state_count > 0: if state_count > 0:
formatted_osd_states[state] = state_count formatted_osd_states[state] = state_count
# Get cluster health data
cluster_health, cluster_health_messages = getClusterHealth(
zkhandler, node_list, vm_list, ceph_osd_list
)
# Format the status data # Format the status data
cluster_information = { cluster_information = {
"health": cluster_health, "cluster_health": getClusterHealth(
"health_messages": cluster_health_messages, zkhandler, node_list, vm_list, ceph_osd_list
),
"node_health": getNodeHealth(zkhandler, node_list),
"maintenance": maintenance_state, "maintenance": maintenance_state,
"primary_node": common.getPrimaryNode(zkhandler), "primary_node": common.getPrimaryNode(zkhandler),
"upstream_ip": zkhandler.read("base.config.upstream_ip"), "upstream_ip": zkhandler.read("base.config.upstream_ip"),

View File

@@ -2,23 +2,34 @@
This directory contains several monitoring resources that can be used with various monitoring systems to track and alert on a PVC cluster system. This directory contains several monitoring resources that can be used with various monitoring systems to track and alert on a PVC cluster system.
### Munin ## Munin
The included munin plugin can be activated by linking to it from `/etc/munin/plugins/pvc`. By default, this plugin triggers a CRITICAL state when either the PVC or Storage cluster becomes Degraded, and is otherwise OK. The overall health is graphed numerically (Optimal is 0, Maintenance is 1, Degraded is 2) so that the cluster health can be tracked over time. The included Munin plugins can be activated by linking to them from `/etc/munin/plugins/`. Two plugins are provided:
When using this plugin, it might be useful to adjust the thresholds with a plugin configuration. For instance, one could adjust the Degraded value from CRITICAL to WARNING by adjusting the critical threshold to a value higher than 1.99 (e.g. 3, 10, etc.) so that only the WARNING threshold will be hit. Alternatively one could instead make Maintenance mode trigger a WARNING by lowering the threshold to 0.99. * `pvc`: Checks the PVC cluster and node health, providing two graphs, one for each.
Example plugin configuration: * `ceph_utilization`: Checks the Ceph cluster statistics, providing multiple graphs. Note that this plugin is independent of PVC itself, and makes local calls to various Ceph commands itself.
``` The `pvc` plugin provides no configuration; the status is hardcoded such that <=90% health is warning, <=50% health is critical, and maintenance state forces OK.
[pvc]
# Make cluster warn on maintenance
env.pvc_cluster_warning 0.99
# Disable critical threshold (>2)
env.pvc_cluster_critical 3
# Make storage warn on maintenance, crit on degraded (latter is default)
env.pvc_storage_warning 0.99
env.pvc_storage_critical 1.99
```
### Check_MK The `ceph_utilization` plugin provides no configuration; only the cluster utilization graph alerts such that >80% used is warning and >90% used is critical. Ceph itself begins warning above 80% as well.
## CheckMK
The included CheckMK plugin is divided into two parts: the agent plugin, and the monitoring server plugin. The monitoring server plugin requires CheckMK version 2.0 or higher. The two parts can be installed as follows: The included CheckMK plugin is divided into two parts: the agent plugin, and the monitoring server plugin. The monitoring server plugin requires CheckMK version 2.0 or higher. The two parts can be installed as follows:
* `pvc`: Place this file in the `/usr/lib/check_mk_agent/plugins/` directory on each node.
* `pvc.py`: Place this file in the `~/local/lib/python3/cmk/base/plugins/agent_based/` directory on the CheckMK monitoring host for each monitoring site.
The plugin provides no configuration: the status is hardcoded such that <=90% health is warning, <=50% health is critical, and maintenance state forces OK.
With both the agent and server plugins installed, you can then run `cmk -II <node>` (or use WATO) to inventory each node, which should produce two new checks:
* `PVC Cluster`: Provides the cluster-wide health. Note that this will be identical for all nodes in the cluster (i.e. if the cluster health drops, all nodes in the cluster will alert this check).
* `PVC Node <shortname>`: Provides the per-node health.
The "Summary" text, shown in the check lists, will be simplistic, only showing the current health percentage.
The "Details" text, found in the specific check details, will show the full list of problem(s) the check finds, as shown by `pvc status` itself.

View File

@@ -0,0 +1,6 @@
#!/bin/bash
# PVC cluster status check for Check_MK (agent-side)
echo "<<<pvc>>>"
pvc --quiet status --format json

View File

@@ -0,0 +1,95 @@
#!/usr/bin/env python3
#
# Check_MK PVC plugin
#
# Copyright 2017-2021, Joshua Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from .agent_based_api.v1 import *
from cmk.base.check_api import host_name
from time import time
from json import loads
def discover_pvc(section):
my_node = host_name().split(".")[0]
yield Service(item=f"PVC Node {my_node}")
yield Service(item="PVC Cluster")
def check_pvc(item, params, section):
state = State.OK
summary = "Stuff"
details = None
data = loads(" ".join(section[0]))
my_node = host_name().split(".")[0]
maintenance_map = {
"true": "on",
"false": "off",
}
maintenance = maintenance_map[data["maintenance"]]
# Node check
if item == f"PVC Node {my_node}":
my_node = host_name().split(".")[0]
node_health = data["node_health"][my_node]["health"]
node_messages = data["node_health"][my_node]["messages"]
summary = f"Node health is {node_health}% (maintenance {maintenance})"
if len(node_messages) > 0:
details = ", ".join(node_messages)
if node_health <= 50 and maintenance == "off":
state = State.CRIT
elif node_health <= 90 and maintenance == "off":
state = State.WARN
else:
state = State.OK
yield Metric(name="node-health", value=node_health)
# Cluster check
elif item == "PVC Cluster":
cluster_health = data["cluster_health"]["health"]
cluster_messages = data["cluster_health"]["messages"]
summary = f"Cluster health is {cluster_health}% (maintenance {maintenance})"
if len(cluster_messages) > 0:
details = ", ".join(cluster_messages)
if cluster_health <= 50 and maintenance == "off":
state = State.CRIT
elif cluster_health <= 90 and maintenance == "off":
state = State.WARN
else:
state = State.OK
yield Metric(name="cluster-health", value=cluster_health)
yield Result(state=state, summary=summary, details=details)
return
register.check_plugin(
name="pvc",
service_name="%s",
check_ruleset_name="pvc",
discovery_function=discover_pvc,
check_function=check_pvc,
check_default_parameters={},
)

View File

@@ -7,23 +7,6 @@
pvc - Plugin to monitor a PVC cluster. pvc - Plugin to monitor a PVC cluster.
=head1 CONFIGURATION
Note that due to how Munin thresholds work, these values must always be slightly less than 1 or 2 respectively,
or the alerts will never be triggered.
Defaults (no config required):
[pvc]
env.warning 1.99
env.critical 1.99
Make degraded cluster WARN only (max value is 2, so 3 effectively disables):
[pvc]
env.pvc_cluster_warning 1.99
env.pvc_cluster_critical 3
=head1 AUTHOR =head1 AUTHOR
Joshua Boniface <joshua@boniface.me> Joshua Boniface <joshua@boniface.me>
@@ -45,24 +28,14 @@ GPLv3
. "$MUNIN_LIBDIR/plugins/plugin.sh" . "$MUNIN_LIBDIR/plugins/plugin.sh"
warning=1.99 warning=1
critical=1.99 critical=2
export PVC_CLIENT_DIR="/run/shm/munin-pvc" export PVC_CLIENT_DIR="/run/shm/munin-pvc"
PVC_CMD="/usr/bin/pvc --quiet --cluster local status --format json-pretty" PVC_CMD="/usr/bin/pvc --quiet --cluster local status --format json-pretty"
JQ_CMD="/usr/bin/jq" JQ_CMD="/usr/bin/jq"
output_usage() { output_usage() {
echo "This plugin outputs numerical values based on the health of the PVC cluster."
echo
echo "There are separate outputs for both the PVC cluster itself as well as the Ceph storage cluster."
echo "In normal operation, i.e. when both clusters are in 'Optimal' state, the plugin returns 0 for"
echo "each cluster. When the cluster is placed into 'Maintenance' mode,the plugin returns 1 for each"
echo "cluster, and goes into WARN state (limit 0.99); this can be adjusted by overriding the WARNING"
echo "threshold of the plugin to something other than 0.99 - note that due to Munin's alerting design,"
echo "the warning value must always be very slightly below the whole number. When either cluster"
echo "element becomes 'Degraded', the plugin returns 2 for the relevant cluster, which is treated as a"
echo "critical. Like the WARNING threshold, this can be overridden, and with the same caveat about limit."
exit 0 exit 0
} }
@@ -84,72 +57,73 @@ output_autoconf() {
} }
output_config() { output_config() {
echo 'graph_title PVC Clusters' echo 'graph_title PVC CHealth'
echo 'graph_args --base 1000' echo 'graph_args --base 100'
echo 'graph_vlabel Count' echo 'graph_vlabel Count'
echo 'graph_category pvc' echo 'graph_category pvc'
echo 'graph_period second' echo 'graph_period second'
echo 'graph_info This graph shows the nodes in the PVC cluster.' echo 'graph_info These graphs show the health of the PVC cluster and specific node.'
echo 'pvc_cluster.label Cluster Degradation' echo 'pvc_cluster.label Cluster Health'
echo 'pvc_cluster.type GAUGE' echo 'pvc_cluster.type GAUGE'
echo 'pvc_cluster.max 2' echo 'pvc_cluster.max 100'
echo 'pvc_cluster.info Whether the PVC cluster is in a degraded state.' echo 'pvc_cluster.info Health of the PVC cluster in %.'
print_warning pvc_cluster
print_critical pvc_cluster
echo 'pvc_storage.label Storage Degradation' echo 'pvc_cluster_alert.label Cluster Health State'
echo 'pvc_storage.type GAUGE' echo 'pvc_cluster_alert.type GAUGE'
echo 'pvc_storage.max 2' echo 'pvc_cluster_alert.max 2',
echo 'pvc_storage.info Whether the storage cluster is in a degraded state.' echo 'pvc_cluster_alert.info Alerting state of the PVC cluster health'
print_warning pvc_storage print_warning pvc_cluster_alert
print_critical pvc_storage print_critical pvc_cluster_alert
echo 'pvc_node.label Node Health'
echo 'pvc_node.type GAUGE'
echo 'pvc_node.max 100'
echo 'pvc_node.info Health of the PVC node in %.'
echo 'pvc_node_alert.label Node Health State'
echo 'pvc_node_alert.type GAUGE'
echo 'pvc_node_alert.max 2',
echo 'pvc_node_alert.info Alerting state of the PVC node health'
print_warning pvc_node_alert
print_critical pvc_node_alert
exit 0 exit 0
} }
output_values() { output_values() {
PVC_OUTPUT="$( $PVC_CMD )" PVC_OUTPUT="$( $PVC_CMD )"
HOST="$( hostname --short )"
cluster_health="$( $JQ_CMD '.health' <<<"${PVC_OUTPUT}" | tr -d '"' )" in_maintenance="$( $JQ_CMD ".maintenance" <<<"${PVC_OUTPUT}" | tr -d '"' )"
cluster_failed_reason="$( $JQ_CMD -r '.health_msg | @csv' <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
case $cluster_health in
"Optimal")
cluster_value="0"
;;
"Maintenance")
cluster_value="1"
;;
"Degraded")
cluster_value="2"
esac
storage_health="$( $JQ_CMD '.storage_health' <<<"${PVC_OUTPUT}" | tr -d '"' )" cluster_health="$( $JQ_CMD ".cluster_health.health" <<<"${PVC_OUTPUT}" | tr -d '"' )"
storage_failed_reason="$( $JQ_CMD -r '.storage_health_msg | @csv' <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )" cluster_health_messages="$( $JQ_CMD -r ".cluster_health.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
case $storage_health in echo "pvc_cluster.value ${cluster_health}"
"Optimal") echo "pvc_cluster.extinfo ${cluster_health_messages}"
storage_value="0"
;;
"Maintenance")
storage_value="1"
;;
"Degraded")
storage_value="2"
esac
if [[ ${cluster_health} -le 50 && ${is_maintenance} == "false" ]]; then
echo "pvc_cluster.value $cluster_value" cluster_health_alert=2
if [[ $cluster_value -eq 1 ]]; then elif [[ ${cluster_health} -le 90 && ${is_maintenance} == "false" ]]; then
echo "pvc_cluster.extinfo Cluster in maintenance mode" cluster_health_alert=1
elif [[ $cluster_value -eq 2 ]]; then else
echo "pvc_cluster.extinfo ${cluster_failed_reason}" cluster_health_alert=0
fi fi
echo "pvc_storage.value $storage_value" echo "pvc_cluster_alert.value ${cluster_health_alert}"
if [[ $storage_value -eq 1 ]]; then
echo "pvc_storage.extinfo Cluster in maintenance mode" node_health="$( $JQ_CMD ".node_health.${HOST}.health" <<<"${PVC_OUTPUT}" | tr -d '"' )"
elif [[ $storage_value -eq 2 ]]; then node_health_messages="$( $JQ_CMD -r ".node_health.${HOST}.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
echo "pvc_storage.extinfo ${storage_failed_reason}" echo "pvc_node.value ${node_health}"
echo "pvc_node.extinfo ${node_health_messages}"
if [[ ${node_health} -le 50 && ${is_maintenance} == "false" ]]; then
node_health_alert=2
elif [[ ${node_health} -le 90 && ${is_maintenance} == "false" ]]; then
node_health_alert=1
else
node_health_alert=0
fi fi
echo "pvc_node_alert.value ${node_health_alert}"
} }
case $# in case $# in

View File

@@ -366,6 +366,9 @@ class MonitoringInstance(object):
if result is not None: if result is not None:
total_health -= result.health_delta total_health -= result.health_delta
if total_health < 0:
total_health = 0
if total_health > 90: if total_health > 90:
health_colour = self.logger.fmt_green health_colour = self.logger.fmt_green
elif total_health > 50: elif total_health > 50: