Add node health value and send it out via the API

2023-02-13 14:37:44 -05:00
parent d8f346abdd
commit 9c14d84bfc
8 changed files with 126 additions and 142 deletions
--- a/node-daemon/plugins/ceph
+++ b/node-daemon/plugins/ceph
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3

-# ceph.py - PVC Monitoring example plugin for ceph status
+# ceph.py - PVC Monitoring example plugin for Ceph status
 # Part of the Parallel Virtual Cluster (PVC) system
 #
 #    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
@@ -111,7 +111,7 @@ class MonitoringPluginScript(MonitoringPlugin):
        self.plugin_result.set_message(message)

        # Set the detailed data in our local PluginResult object
-        self.plugin_result.set_data(dumps(health_status))
+        self.plugin_result.set_data(health_status)

        # Return our local PluginResult object
        return self.plugin_result
--- a/node-daemon/plugins/ceph-cluster
+++ b/node-daemon/plugins/ceph-cluster
@@ -1,126 +0,0 @@
-#!/usr/bin/env python3
-
-# ceph-cluster.py - PVC Monitoring example plugin for Ceph status
-# Part of the Parallel Virtual Cluster (PVC) system
-#
-#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
-#
-#    This program is free software: you can redistribute it and/or modify
-#    it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation, version 3.
-#
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-#
-#    You should have received a copy of the GNU General Public License
-#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
-#
-###############################################################################
-
-# This script provides an example of a PVC monitoring plugin script. It will create
-# a simple plugin to check the Ceph cluster health for anomalies, and return a health
-# delta reflective of the overall Ceph status (HEALTH_WARN = 10, HEALTH_ERR = 50).
-
-# This script can thus be used as an example or reference implementation of a
-# PVC monitoring pluginscript and expanded upon as required.
-
-# A monitoring plugin script must implement the class "MonitoringPluginScript" which
-# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
-# of the role of each function is provided in context of the example; see the other
-# examples for more potential uses.
-
-# WARNING:
-#
-# This script will run in the context of the node daemon keepalives as root.
-# DO NOT install untrusted, unvetted plugins under any circumstances.
-
-
-# This import is always required here, as MonitoringPlugin is used by the
-# MonitoringPluginScript class
-from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
-
-
-# A monitoring plugin script must always expose its nice name, which must be identical to
-# the file name
-PLUGIN_NAME = "ceph-cluster"
-
-
-# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
-class MonitoringPluginScript(MonitoringPlugin):
-    def setup(self):
-        """
-        setup(): Perform special setup steps during node daemon startup
-
-        This step is optional and should be used sparingly.
-        """
-
-        pass
-
-    def run(self):
-        """
-        run(): Perform the check actions and return a PluginResult object
-        """
-
-        # Run any imports first
-        from rados import Rados
-        from json import loads, dumps
-
-        # Connect to the Ceph cluster
-        try:
-            ceph_conn = Rados(
-                conffile=self.config["ceph_config_file"],
-                conf=dict(keyring=self.config["ceph_admin_keyring"]),
-            )
-            ceph_conn.connect(timeout=1)
-        except Exception as e:
-            self.log(f"Failed to connect to Ceph cluster: {e}", state="e")
-            return self.plugin_result
-            
-        # Get the Ceph cluster health
-        try:
-            health_status = loads(
-                ceph_conn.mon_command(dumps({"prefix": "health", "format": "json"}), b"", timeout=1)[1]
-            )
-            ceph_health = health_status["status"]
-        except Exception as e:
-            self.log(f"Failed to get health data from Ceph cluster: {e}", state="e")
-            return self.plugin_result
-        finally:
-            ceph_conn.shutdown()
-
-        # Get a list of error entries in the health status output
-        error_entries = health_status["checks"].keys()
-
-        # Set the health delta based on the errors presented
-        if ceph_health == "HEALTH_ERR":
-            health_delta = 50
-            message = f"Ceph cluster in ERROR state: {', '.join(error_entries)}"
-        elif ceph_health == "HEALTH_WARN":
-            health_delta = 10
-            message = f"Ceph cluster in WARNING state: {', '.join(error_entries)}"
-        else:
-            health_delta = 0
-            message = "Ceph cluster in OK state"
-
-        # Set the health delta in our local PluginResult object
-        self.plugin_result.set_health_delta(health_delta)
-
-        # Set the message in our local PluginResult object
-        self.plugin_result.set_message(message)
-
-        # Set the detailed data in our local PluginResult object
-        self.plugin_result.set_data(dumps(health_status))
-
-        # Return our local PluginResult object
-        return self.plugin_result
-
-    def cleanup(self):
-        """
-        cleanup(): Perform special cleanup steps during node daemon termination
-
-        This step is optional and should be used sparingly.
-        """
-
-        pass
--- a/node-daemon/plugins/dpkg
+++ b/node-daemon/plugins/dpkg
@@ -66,7 +66,6 @@ class MonitoringPluginScript(MonitoringPlugin):

        # Run any imports first
        from re import match
-        from json import dumps
        import daemon_lib.common as pvc_common

        # Get Debian version
@@ -143,7 +142,7 @@ class MonitoringPluginScript(MonitoringPlugin):
            "inconsistent_packages": list_inconsistent,
            "upgradable_packages": list_upgradable,
        }
-        self.plugin_result.set_data(dumps(detailed_data))
+        self.plugin_result.set_data(detailed_data)

        # Return our local PluginResult object
        return self.plugin_result
--- a/node-daemon/pvcnoded/objects/MonitoringInstance.py
+++ b/node-daemon/pvcnoded/objects/MonitoringInstance.py
@@ -25,6 +25,7 @@ import importlib.util

 from os import walk
 from datetime import datetime
+from json import dumps


 class PluginResult(object):
@@ -37,7 +38,7 @@ class PluginResult(object):
        self.current_time = int(time.time())
        self.health_delta = 0
        self.message = None
-        self.data = None
+        self.data = {}
        self.runtime = "0.00"

    def set_health_delta(self, new_delta):
@@ -98,7 +99,7 @@ class PluginResult(object):
                        "monitoring_plugin.data",
                        self.plugin_name,
                    ),
-                    self.data,
+                    dumps(self.data),
                ),
                (
                    (
@@ -259,7 +260,7 @@ class MonitoringInstance(object):
                                "monitoring_plugin.data",
                                plugin.plugin_name,
                            ),
-                            None,
+                            dumps({}),
                        ),
                        (
                            (
@@ -286,7 +287,7 @@ class MonitoringInstance(object):
            [
                (
                    ("node.monitoring.plugins", self.this_node.name),
-                    self.all_plugin_names,
+                    " ".join(self.all_plugin_names),
                ),
            ]
        )
@@ -346,6 +347,14 @@ class MonitoringInstance(object):
        else:
            health_colour = self.logger.fmt_red

+        self.zkhandler.write(
+            [
+                (
+                    ("node.monitoring.health", self.this_node.name),
+                    total_health,
+                ),
+            ]
+        )
        self.logger.out(
            f"System health: {health_colour}{total_health}/100{self.logger.fmt_end}",
            state="t",