Split health monitoring into discrete daemon/pkg

2023-11-29 15:36:49 -05:00
parent 74a416165d
commit 41f4e4fb2f
40 changed files with 3032 additions and 33 deletions
--- a/health-daemon/daemon_lib
+++ b/health-daemon/daemon_lib
@@ -0,0 +1 @@
+../daemon-common
--- a/health-daemon/plugins/disk
+++ b/health-daemon/plugins/disk
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+
+# disk.py - PVC Monitoring example plugin for disk (system + OSD)
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+# This script provides an example of a PVC monitoring plugin script. It will create
+# a simple plugin to check the system and OSD disks for errors and faults and return
+# a health delta corresponding to severity.
+
+# This script can thus be used as an example or reference implementation of a
+# PVC monitoring pluginscript and expanded upon as required.
+
+# A monitoring plugin script must implement the class "MonitoringPluginScript" which
+# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
+# of the role of each function is provided in context of the example; see the other
+# examples for more potential uses.
+
+# WARNING:
+#
+# This script will run in the context of the node daemon keepalives as root.
+# DO NOT install untrusted, unvetted plugins under any circumstances.
+
+
+# This import is always required here, as MonitoringPlugin is used by the
+# MonitoringPluginScript class
+from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
+
+
+# A monitoring plugin script must always expose its nice name, which must be identical to
+# the file name
+PLUGIN_NAME = "disk"
+
+
+# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
+class MonitoringPluginScript(MonitoringPlugin):
+    def setup(self):
+        """
+        setup(): Perform special setup steps during node daemon startup
+
+        This step is optional and should be used sparingly.
+
+        If you wish for the plugin to not load in certain conditions, do any checks here
+        and return a non-None failure message to indicate the error.
+        """
+
+        from daemon_lib.common import run_os_command
+        from json import loads
+
+        _, _all_disks, _ = run_os_command("lsblk --json --paths --include 8,259")
+        try:
+            all_disks = loads(_all_disks)
+        except Exception as e:
+            return f"Error loading lsblk JSON: {e}"
+
+        disk_details = list()
+
+        def get_smartinfo(disk, extra_opt=""):
+            _, _smart_info, _ = run_os_command(f"smartctl --info --json {extra_opt} {disk}")
+            try:
+                smart_info = loads(_smart_info)
+            except Exception as e:
+                return None
+
+            return smart_info
+
+        for disk in [disk["name"] for disk in all_disks['blockdevices']]:
+            extra_opt = ""
+            smart_info = get_smartinfo(disk)
+            if smart_info is None or smart_info["smartctl"]["exit_status"] > 1:
+                continue
+            elif smart_info["smartctl"]["exit_status"] == 1:
+                if "requires option" in smart_info["smartctl"]["messages"][0]["string"]:
+                    extra_opt = smart_info["smartctl"]["messages"][0]["string"].split("'")[1].replace('N','0')
+                    smart_info = get_smartinfo(disk, extra_opt)
+                    if smart_info is None or smart_info["smartctl"]["exit_status"] > 0:
+                        continue
+                else:
+                    continue
+
+            disk_type = smart_info["device"]["type"]
+
+            disk_details.append((disk, extra_opt, disk_type))
+
+        self.disk_details = disk_details
+
+
+    def run(self, coordinator_state=None):
+        """
+        run(): Perform the check actions and return a PluginResult object
+
+        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
+        """
+
+        # Re-run setup each time to ensure the disk details are current
+        self.setup()
+
+        # Run any imports first
+        from daemon_lib.common import run_os_command
+        from json import loads
+
+        health_delta = 0
+        messages = list()
+
+        for _disk in self.disk_details:
+            disk = _disk[0]
+            extra_opt = _disk[1]
+            disk_type = _disk[2]
+
+            _, _smart_info, _ = run_os_command(f"smartctl --all --json {extra_opt} {disk}")
+            try:
+                smart_info = loads(_smart_info)
+            except Exception as e:
+                health_delta += 10
+                messages.append(f"{disk} failed to load SMART data")
+                continue
+
+            if disk_type == 'nvme':
+                for attribute in smart_info.get('nvme_smart_health_information_log', {}).items():
+                    if attribute[0] == "critical_warning" and attribute[1] > 0:
+                        health_delta += 10
+                        messages.append(f"{disk} critical warning value {attribute[1]}")
+                    if attribute[0] == "media_errors" and attribute[1] > 0:
+                        health_delta += 10
+                        messages.append(f"{disk} media errors value {attribute[1]}")
+                    if attribute[0] == "percentage_used" and attribute[1] > 90:
+                        health_delta += 10
+                        messages.append(f"{disk} percentage used value {attribute[1]}%")
+            else:
+                for attribute in smart_info.get('ata_smart_attributes', {}).get('table', []):
+                    if attribute["when_failed"]:
+                        health_delta += 10
+                        messages.append(f"{disk} attribute {attribute['name']} value {attribute['raw']['value']}")
+
+        if len(messages) < 1:
+            messages.append(f"All {len(self.disk_details)} checked disks report OK: {', '.join([disk[0] for disk in self.disk_details])}")
+
+        # Set the health delta in our local PluginResult object
+        self.plugin_result.set_health_delta(health_delta)
+
+        # Set the message in our local PluginResult object
+        self.plugin_result.set_message(', '.join(messages))
+
+        # Return our local PluginResult object
+        return self.plugin_result
+
+    def cleanup(self):
+        """
+        cleanup(): Perform special cleanup steps during node daemon termination
+
+        This step is optional and should be used sparingly.
+        """
+
+        pass
--- a/health-daemon/plugins/dpkg
+++ b/health-daemon/plugins/dpkg
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+
+# dpkg.py - PVC Monitoring example plugin for dpkg status
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+# This script provides an example of a PVC monitoring plugin script. It will create
+# a simple plugin to check the system dpkg status is as expected, with no invalid
+# packages or obsolete configuration files, and will return a 1 health delta for each
+# flaw in invalid packages, upgradable packages, and obsolete config files.
+
+# This script can thus be used as an example or reference implementation of a
+# PVC monitoring pluginscript and expanded upon as required.
+
+# A monitoring plugin script must implement the class "MonitoringPluginScript" which
+# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
+# of the role of each function is provided in context of the example; see the other
+# examples for more potential uses.
+
+# WARNING:
+#
+# This script will run in the context of the node daemon keepalives as root.
+# DO NOT install untrusted, unvetted plugins under any circumstances.
+
+
+# This import is always required here, as MonitoringPlugin is used by the
+# MonitoringPluginScript class
+from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
+
+
+# A monitoring plugin script must always expose its nice name, which must be identical to
+# the file name
+PLUGIN_NAME = "dpkg"
+
+
+# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
+class MonitoringPluginScript(MonitoringPlugin):
+    def setup(self):
+        """
+        setup(): Perform special setup steps during node daemon startup
+
+        This step is optional and should be used sparingly.
+
+        If you wish for the plugin to not load in certain conditions, do any checks here
+        and return a non-None failure message to indicate the error.
+        """
+
+        pass
+
+    def run(self, coordinator_state=None):
+        """
+        run(): Perform the check actions and return a PluginResult object
+
+        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
+        """
+
+        # Run any imports first
+        from re import match
+        import daemon_lib.common as pvc_common
+
+        # Get Debian version
+        with open('/etc/debian_version', 'r') as fh:
+            debian_version = fh.read().strip()
+
+        # Get a list of dpkg packages for analysis
+        retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/dpkg --list")
+
+        # Get a list of installed packages and states
+        packages = list()
+        for dpkg_line in stdout.split('\n'):
+            if match('^[a-z][a-z] ', dpkg_line):
+                line_split = dpkg_line.split()
+                package_state = line_split[0]
+                package_name = line_split[1]
+                packages.append((package_name, package_state))
+
+        count_ok = 0
+        count_inconsistent = 0
+        list_inconsistent = list()
+
+        for package in packages:
+            if package[1] == "ii":
+                count_ok += 1
+            else:
+                count_inconsistent += 1
+                list_inconsistent.append(package[0])
+
+        # Get upgradable packages
+        retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/apt list --upgradable")
+
+        list_upgradable = list()
+        for apt_line in stdout.split('\n'):
+            if match('^[a-z][a-z] ', apt_line):
+                line_split = apt_line.split('/')
+                package_name = line_split[0]
+                list_upgradable.append(package_name)
+
+        count_upgradable = len(list_upgradable)
+
+        # Get obsolete config files (dpkg-*, ucf-*, or update-* under /etc)
+        retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/find /etc -type f -a \( -name '*.dpkg-*' -o -name '*.ucf-*' -o -name '*.update-*' \)")
+
+        obsolete_conffiles = list()
+        for conffile_line in stdout.split('\n'):
+            if conffile_line:
+                obsolete_conffiles.append(conffile_line)
+
+        count_obsolete_conffiles = len(obsolete_conffiles)
+
+        # Set health_delta based on the results
+        health_delta = 0
+        if count_inconsistent > 0:
+            health_delta += 1
+        if count_upgradable > 0:
+            health_delta += 1
+        if count_obsolete_conffiles > 0:
+            health_delta += 1
+
+        # Set the health delta in our local PluginResult object
+        self.plugin_result.set_health_delta(health_delta)
+
+        # Craft the message
+        message = f"Debian {debian_version}; Obsolete conffiles: {count_obsolete_conffiles}; Packages inconsistent: {count_inconsistent}, upgradable: {count_upgradable}"
+
+        # Set the message in our local PluginResult object
+        self.plugin_result.set_message(message)
+
+        # Set the detailed data in our local PluginResult object
+        detailed_data = {
+            "debian_version": debian_version,
+            "obsolete_conffiles": obsolete_conffiles,
+            "inconsistent_packages": list_inconsistent,
+            "upgradable_packages": list_upgradable,
+        }
+        self.plugin_result.set_data(detailed_data)
+
+        # Return our local PluginResult object
+        return self.plugin_result
+
+    def cleanup(self):
+        """
+        cleanup(): Perform special cleanup steps during node daemon termination
+
+        This step is optional and should be used sparingly.
+        """
+
+        pass
--- a/health-daemon/plugins/edac
+++ b/health-daemon/plugins/edac
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+
+# edac.py - PVC Monitoring example plugin for EDAC
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+# This script provides an example of a PVC monitoring plugin script. It will create
+# a simple plugin to check the system's EDAC registers and report any failures.
+
+# This script can thus be used as an example or reference implementation of a
+# PVC monitoring pluginscript and expanded upon as required.
+
+# A monitoring plugin script must implement the class "MonitoringPluginScript" which
+# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
+# of the role of each function is provided in context of the example; see the other
+# examples for more potential uses.
+
+# WARNING:
+#
+# This script will run in the context of the node daemon keepalives as root.
+# DO NOT install untrusted, unvetted plugins under any circumstances.
+
+
+# This import is always required here, as MonitoringPlugin is used by the
+# MonitoringPluginScript class
+from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
+
+
+# A monitoring plugin script must always expose its nice name, which must be identical to
+# the file name
+PLUGIN_NAME = "edac"
+
+
+# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
+class MonitoringPluginScript(MonitoringPlugin):
+    def setup(self):
+        """
+        setup(): Perform special setup steps during node daemon startup
+
+        This step is optional and should be used sparingly.
+
+        If you wish for the plugin to not load in certain conditions, do any checks here
+        and return a non-None failure message to indicate the error.
+        """
+
+        pass
+
+    def run(self, coordinator_state=None):
+        """
+        run(): Perform the check actions and return a PluginResult object
+
+        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
+        """
+
+        # Run any imports first
+        import daemon_lib.common as common
+        from re import match, search
+
+        # Get edac-util output
+        retcode, stdout, stderr = common.run_os_command('/usr/bin/edac-util')
+
+        # If there's no errors, we're OK
+        if match(r'^edac-util: No errors to report.', stdout):
+            health_delta = 0
+            message = "EDAC reports no errors"
+        else:
+            health_delta = 0
+            message = "EDAC reports errors: "
+            errors = list()
+            for line in stdout.split('\n'):
+                if match(r'^mc[0-9]: csrow', line):
+                    if 'Uncorrected' in line:
+                        health_delta = 50
+                    errors.append(' '.join(line.split()[2:]))
+            message += ', '.join(errors)
+
+        # Set the health delta in our local PluginResult object
+        self.plugin_result.set_health_delta(health_delta)
+
+        # Set the message in our local PluginResult object
+        self.plugin_result.set_message(message)
+
+        # Return our local PluginResult object
+        return self.plugin_result
+
+    def cleanup(self):
+        """
+        cleanup(): Perform special cleanup steps during node daemon termination
+
+        This step is optional and should be used sparingly.
+        """
+
+        pass
--- a/health-daemon/plugins/hwrd
+++ b/health-daemon/plugins/hwrd
@@ -0,0 +1,247 @@
+#!/usr/bin/env python3
+
+# hwrd.py - PVC Monitoring example plugin for hardware RAID Arrays
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2023 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+# This script provides an example of a PVC monitoring plugin script. It will create
+# a simple plugin to check any hardwrae RAID virtual disks for health and report errors.
+# Supports Dell BOSS cards, LSI/Avago/Broadcom MegaRAID, and HP SmartArray RAID.
+
+# This script can thus be used as an example or reference implementation of a
+# PVC monitoring pluginscript and expanded upon as required.
+
+# A monitoring plugin script must implement the class "MonitoringPluginScript" which
+# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
+# of the role of each function is provided in context of the example; see the other
+# examples for more potential uses.
+
+# WARNING:
+#
+# This script will run in the context of the node daemon keepalives as root.
+# DO NOT install untrusted, unvetted plugins under any circumstances.
+
+
+# This import is always required here, as MonitoringPlugin is used by the
+# MonitoringPluginScript class
+from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
+
+
+# A monitoring plugin script must always expose its nice name, which must be identical to
+# the file name
+PLUGIN_NAME = "hwrd"
+
+
+# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
+class MonitoringPluginScript(MonitoringPlugin):
+    def check_dellboss(self):
+        # Run any imports first
+        from daemon_lib.common import run_os_command
+        from re import match
+
+        health_delta = 0
+        messages = list()
+
+        _dellboss_ret, _dellboss_list, _ = run_os_command("mvcli info -o vd")
+        if _dellboss_ret != 0:
+            health_delta = 50
+            messages.append("Error running MVCLI command")
+        else:
+            arrays = list()
+            idx = None
+
+            for line in _dellboss_list.split('\n'):
+                if match(r"^id:", line):
+                    idx = int(line.split(":")[-1].strip())
+                    arrays.append(dict())
+                if match(r"^name:", line):
+                    arrays[idx]["name"] = line.split(":")[-1].strip()
+                if match(r"^status:", line):
+                    arrays[idx]["status"] = line.split(":")[-1].strip()
+
+            for idx, array in enumerate(arrays):
+                if array["status"] != "functional":
+                    health_delta += 50
+                messages.append(f"RAID Dell BOSS ID {idx} (Name: {array['name']}, State: {array['status']})")
+                    
+        if len(messages) < 1:
+            messages.append(f"No valid RAID arrays found")
+
+        return health_delta, messages
+
+    def check_megaraid(self):
+        # Run any imports first
+        from daemon_lib.common import run_os_command
+        from re import match
+
+        health_delta = 0
+        messages = list()
+
+        _megaraid_ret, _megaraid_list, _ = run_os_command("megacli -LDInfo -Lall -aALL")
+        if _megaraid_ret != 0:
+            health_delta = 50
+            messages.append("Error running MegaCLI command")
+        else:
+            vd_list = _megaraid_list.split('\n\n\n')
+            for idx, _vd in enumerate(vd_list):
+                vd = _vd.split('\n')
+                if "Virtual Drive Information" not in vd[2]:
+                    continue
+
+                raid_name = None
+                raid_count = 0
+                raid_state = None
+
+                for entry in vd:
+                    if len(entry.split(':')) < 2:
+                        continue
+
+                    entry_key = entry.split(':')[0].strip()
+                    entry_value = entry.split(':')[1].strip()
+
+                    if entry_key == "State":
+                        raid_state = entry_value
+                    if entry_key == "Name":
+                        raid_name = entry_value
+                    if entry_key == "Number Of Drives":
+                        raid_count = entry_value
+
+                if raid_state is None or raid_name is None or raid_count == 0:
+                    health_delta += 10
+                    messages.append(f"RAID ID {idx} did not report useful values")
+                    continue
+
+                if raid_state != "Optimal":
+                    health_delta += 50
+                messages.append(f"RAID MegaRAID ID {idx} (Name: {raid_name}, Disks: {raid_count}, State: {raid_state})")
+                        
+        if len(messages) < 1:
+            messages.append(f"No valid RAID arrays found")
+
+        return health_delta, messages
+
+    def check_hpsa(self):
+        # Run any imports first
+        from daemon_lib.common import run_os_command
+        from re import match, findall
+
+        health_delta = 0
+        messages = list()
+
+        _hparray_ret, _hparray_list, _ = run_os_command(f"ssacli ctrl slot={self.controller_slot} ld all show")
+        if _hparray_ret != 0:
+            health_delta = 50
+            messages.append("Error running SSACLI command")
+        else:
+            vd_lines = _hparray_list.split('\n\n')
+
+            arrays = list()
+            cur_array = None
+            for idx, _line in enumerate(vd_lines):
+                line = _line.strip()
+                if match(r"^Array", line):
+                    cur_array = line
+                if match(r"^logicaldrive", line) and cur_array is not None:
+                    arrays.append(f"{cur_array} {line}")
+
+            for array in arrays:
+                if "OK" not in array:
+                    health_delta += 50
+                messages.append(f"RAID HPSA {array}")
+                        
+        if len(messages) < 1:
+            messages.append(f"No valid RAID arrays found")
+
+        return health_delta, messages
+
+    def setup(self):
+        """
+        setup(): Perform special setup steps during node daemon startup
+
+        This step is optional and should be used sparingly.
+
+        If you wish for the plugin to not load in certain conditions, do any checks here
+        and return a non-None failure message to indicate the error.
+        """
+
+        from daemon_lib.common import run_os_command
+        from re import match, findall
+
+        self.raid_type = list()
+
+        _dellboss_ret, _dellboss_list, _ = run_os_command("mvcli info -o vd")
+        if _dellboss_ret == 0:
+            # If this returns 0 at all, there's a valid BOSS card to manage
+            self.raid_type.append("dellboss")
+
+        _megaraid_ret, _megaraid_list, _ = run_os_command("megacli -LDInfo -Lall -aALL")
+        if _megaraid_ret == 0:
+            vd_list = _megaraid_list.split('\n\n\n')
+            for idx, _vd in enumerate(vd_list):
+                vd = _vd.split('\n')
+                if "Virtual Drive Information" in vd[2]:
+                    self.raid_type.append("megaraid")
+
+        _hpraid_ret, _hpraid_list, _ = run_os_command("ssacli ctrl all show status")
+        if _hpraid_ret == 0:
+            for line in _hpraid_list.split('\n'):
+                if match(r"^Smart", line):
+                    controller_slots = findall("Slot ([0-9])", line)
+                    if len(controller_slots) > 0:
+                        self.raid_type.append("hpsa")
+                        self.controller_slot = controller_slots[0]
+
+        if len(self.raid_type) < 1:
+            return "No hardware RAID management commands found"
+
+    def run(self, coordinator_state=None):
+        """
+        run(): Perform the check actions and return a PluginResult object
+        """
+
+        health_delta = 0
+        messages = list()
+
+        raid_function_map = {
+            "megaraid": self.check_megaraid,
+            "hpsa": self.check_hpsa,
+            "dellboss": self.check_dellboss,
+        }
+
+        for raid_type in self.raid_type:
+            _health_delta, _messages = raid_function_map.get(raid_type)()
+            health_delta += _health_delta
+            messages += _messages
+
+        # Set the health delta in our local PluginResult object
+        self.plugin_result.set_health_delta(health_delta)
+
+        # Set the message in our local PluginResult object
+        self.plugin_result.set_message(', '.join(messages))
+
+        # Return our local PluginResult object
+        return self.plugin_result
+
+    def cleanup(self):
+        """
+        cleanup(): Perform special cleanup steps during node daemon termination
+
+        This step is optional and should be used sparingly.
+        """
+
+        pass
--- a/health-daemon/plugins/ipmi
+++ b/health-daemon/plugins/ipmi
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+
+# ipmi.py - PVC Monitoring example plugin for IPMI
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+# This script provides an example of a PVC monitoring plugin script. It will create
+# a simple plugin to check whether the system IPMI is reachable.
+
+# This script can thus be used as an example or reference implementation of a
+# PVC monitoring pluginscript and expanded upon as required.
+
+# A monitoring plugin script must implement the class "MonitoringPluginScript" which
+# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
+# of the role of each function is provided in context of the example; see the other
+# examples for more potential uses.
+
+# WARNING:
+#
+# This script will run in the context of the node daemon keepalives as root.
+# DO NOT install untrusted, unvetted plugins under any circumstances.
+
+
+# This import is always required here, as MonitoringPlugin is used by the
+# MonitoringPluginScript class
+from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
+
+
+# A monitoring plugin script must always expose its nice name, which must be identical to
+# the file name
+PLUGIN_NAME = "ipmi"
+
+
+# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
+class MonitoringPluginScript(MonitoringPlugin):
+    def setup(self):
+        """
+        setup(): Perform special setup steps during node daemon startup
+
+        This step is optional and should be used sparingly.
+
+        If you wish for the plugin to not ipmi in certain conditions, do any checks here
+        and return a non-None failure message to indicate the error.
+        """
+
+        pass
+
+    def run(self, coordinator_state=None):
+        """
+        run(): Perform the check actions and return a PluginResult object
+
+        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
+        """
+
+        # Run any imports first
+        from daemon_lib.common import run_os_command
+
+        # Check the node's IPMI interface
+        ipmi_hostname = self.config["ipmi_hostname"]
+        ipmi_username = self.config["ipmi_username"]
+        ipmi_password = self.config["ipmi_password"]
+        retcode, _, _ = run_os_command(
+            f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_username} -P {ipmi_password} chassis power status",
+            timeout=5
+        )
+
+        if retcode > 0:
+            # Set the health delta to 10 (subtract 10 from the total of 100)
+            health_delta = 10
+            # Craft a message that can be used by the clients
+            message = f"IPMI via {ipmi_username}@{ipmi_hostname} is NOT responding"
+        else:
+            # Set the health delta to 0 (no change)
+            health_delta = 0
+            # Craft a message that can be used by the clients
+            message = f"IPMI via {ipmi_username}@{ipmi_hostname} is responding"
+
+        # Set the health delta in our local PluginResult object
+        self.plugin_result.set_health_delta(health_delta)
+
+        # Set the message in our local PluginResult object
+        self.plugin_result.set_message(message)
+
+        # Return our local PluginResult object
+        return self.plugin_result
+
+    def cleanup(self):
+        """
+        cleanup(): Perform special cleanup steps during node daemon termination
+
+        This step is optional and should be used sparingly.
+        """
+
+        pass
--- a/health-daemon/plugins/kydb
+++ b/health-daemon/plugins/kydb
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+
+# kydb.py - PVC Monitoring example plugin for KeyDB/Redis
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2023 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+# This script provides an example of a PVC monitoring plugin script. It will create
+# a simple plugin to check the Libvirt daemon instance on the node for operation.
+
+# This script can thus be used as an example or reference implementation of a
+# PVC monitoring pluginscript and expanded upon as required.
+
+# A monitoring plugin script must implement the class "MonitoringPluginScript" which
+# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
+# of the role of each function is provided in context of the example; see the other
+# examples for more potential uses.
+
+# WARNING:
+#
+# This script will run in the context of the node daemon keepalives as root.
+# DO NOT install untrusted, unvetted plugins under any circumstances.
+
+
+# This import is always required here, as MonitoringPlugin is used by the
+# MonitoringPluginScript class
+from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
+
+
+# A monitoring plugin script must always expose its nice name, which must be identical to
+# the file name
+PLUGIN_NAME = "kydb"
+
+
+# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
+class MonitoringPluginScript(MonitoringPlugin):
+    def setup(self):
+        """
+        setup(): Perform special setup steps during node daemon startup
+
+        This step is optional and should be used sparingly.
+
+        If you wish for the plugin to not kydb in certain conditions, do any checks here
+        and return a non-None failure message to indicate the error.
+        """
+
+        pass
+
+    def run(self, coordinator_state=None):
+        """
+        run(): Perform the check actions and return a PluginResult object
+
+        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
+        """
+
+        # Run any imports first
+        from redis import Redis
+
+        rd_conn = None
+
+        # Set the health delta to 0 (no change)
+        health_delta = 0
+        # Craft a message that can be used by the clients
+        message = "Successfully connected to Libvirtd on localhost"
+
+        # Check the Zookeeper connection
+        try:
+            rd_conn = Redis(host='localhost', port=6379, decode_responses=True)
+            data = rd_conn.info()
+        except Exception as e:
+            health_delta = 50
+            message = f"Failed to connect to KeyDB/Redis: {e}"
+        finally:
+            del rd_conn
+
+        # Set the health delta in our local PluginResult object
+        self.plugin_result.set_health_delta(health_delta)
+
+        # Set the message in our local PluginResult object
+        self.plugin_result.set_message(message)
+
+        # Return our local PluginResult object
+        return self.plugin_result
+
+    def cleanup(self):
+        """
+        cleanup(): Perform special cleanup steps during node daemon termination
+
+        This step is optional and should be used sparingly.
+        """
+
+        pass
--- a/health-daemon/plugins/lbvt
+++ b/health-daemon/plugins/lbvt
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+
+# lbvt.py - PVC Monitoring example plugin for Libvirtd
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+# This script provides an example of a PVC monitoring plugin script. It will create
+# a simple plugin to check the Libvirt daemon instance on the node for operation.
+
+# This script can thus be used as an example or reference implementation of a
+# PVC monitoring pluginscript and expanded upon as required.
+
+# A monitoring plugin script must implement the class "MonitoringPluginScript" which
+# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
+# of the role of each function is provided in context of the example; see the other
+# examples for more potential uses.
+
+# WARNING:
+#
+# This script will run in the context of the node daemon keepalives as root.
+# DO NOT install untrusted, unvetted plugins under any circumstances.
+
+
+# This import is always required here, as MonitoringPlugin is used by the
+# MonitoringPluginScript class
+from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
+
+
+# A monitoring plugin script must always expose its nice name, which must be identical to
+# the file name
+PLUGIN_NAME = "lbvt"
+
+
+# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
+class MonitoringPluginScript(MonitoringPlugin):
+    def setup(self):
+        """
+        setup(): Perform special setup steps during node daemon startup
+
+        This step is optional and should be used sparingly.
+
+        If you wish for the plugin to not lbvt in certain conditions, do any checks here
+        and return a non-None failure message to indicate the error.
+        """
+
+        pass
+
+    def run(self, coordinator_state=None):
+        """
+        run(): Perform the check actions and return a PluginResult object
+
+        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
+        """
+
+        # Run any imports first
+        from libvirt import openReadOnly as lvopen
+
+        lv_conn = None
+
+        # Set the health delta to 0 (no change)
+        health_delta = 0
+        # Craft a message that can be used by the clients
+        message = "Successfully connected to Libvirtd on localhost"
+
+        # Check the Zookeeper connection
+        try:
+            lv_conn = lvopen(f"qemu+tcp://{self.this_node.name}/system")
+            data = lv_conn.getHostname()
+        except Exception as e:
+            health_delta = 50
+            message = f"Failed to connect to Libvirtd: {e}"
+        finally:
+            if lv_conn is not None:
+                lv_conn.close()
+
+        # Set the health delta in our local PluginResult object
+        self.plugin_result.set_health_delta(health_delta)
+
+        # Set the message in our local PluginResult object
+        self.plugin_result.set_message(message)
+
+        # Return our local PluginResult object
+        return self.plugin_result
+
+    def cleanup(self):
+        """
+        cleanup(): Perform special cleanup steps during node daemon termination
+
+        This step is optional and should be used sparingly.
+        """
+
+        pass
--- a/health-daemon/plugins/load
+++ b/health-daemon/plugins/load
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+
+# load.py - PVC Monitoring example plugin for load
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+# This script provides an example of a PVC monitoring plugin script. It will create
+# a simple plugin to check the system load against the total number of CPU cores.
+
+# This script can thus be used as an example or reference implementation of a
+# PVC monitoring pluginscript and expanded upon as required.
+
+# A monitoring plugin script must implement the class "MonitoringPluginScript" which
+# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
+# of the role of each function is provided in context of the example; see the other
+# examples for more potential uses.
+
+# WARNING:
+#
+# This script will run in the context of the node daemon keepalives as root.
+# DO NOT install untrusted, unvetted plugins under any circumstances.
+
+
+# This import is always required here, as MonitoringPlugin is used by the
+# MonitoringPluginScript class
+from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
+
+
+# A monitoring plugin script must always expose its nice name, which must be identical to
+# the file name
+PLUGIN_NAME = "load"
+
+
+# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
+class MonitoringPluginScript(MonitoringPlugin):
+    def setup(self):
+        """
+        setup(): Perform special setup steps during node daemon startup
+
+        This step is optional and should be used sparingly.
+
+        If you wish for the plugin to not load in certain conditions, do any checks here
+        and return a non-None failure message to indicate the error.
+        """
+
+        pass
+
+    def run(self, coordinator_state=None):
+        """
+        run(): Perform the check actions and return a PluginResult object
+
+        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
+        """
+
+        # Run any imports first
+        from os import getloadavg
+        from psutil import cpu_count
+
+        # Get the current 1-minute system load average
+        load_average = float(round(getloadavg()[0], 2))
+
+        # Get the number of CPU cores
+        cpu_cores = cpu_count()
+
+        # Check that the load average is greater or equal to the cpu count
+        if load_average > float(cpu_cores):
+            # Set the health delta to 10 (subtract 10 from the total of 100)
+            health_delta = 50
+            # Craft a message that can be used by the clients
+            message = f"Current load is {load_average} out of {cpu_cores} CPU cores"
+
+        else:
+            # Set the health delta to 0 (no change)
+            health_delta = 0
+            # Craft a message that can be used by the clients
+            message = f"Current load is {load_average} out of {cpu_cores} CPU cores"
+
+        # Set the health delta in our local PluginResult object
+        self.plugin_result.set_health_delta(health_delta)
+
+        # Set the message in our local PluginResult object
+        self.plugin_result.set_message(message)
+
+        # Return our local PluginResult object
+        return self.plugin_result
+
+    def cleanup(self):
+        """
+        cleanup(): Perform special cleanup steps during node daemon termination
+
+        This step is optional and should be used sparingly.
+        """
+
+        pass
--- a/health-daemon/plugins/nics
+++ b/health-daemon/plugins/nics
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+
+# nics.py - PVC Monitoring example plugin for NIC interfaces
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+# This script provides an example of a PVC monitoring plugin script. It will create
+# a simple plugin to check the network interfaces of the host, specifically for speed
+# and 802.3ad status (if applicable).
+
+# This script can thus be used as an example or reference implementation of a
+# PVC monitoring pluginscript and expanded upon as required.
+
+# A monitoring plugin script must implement the class "MonitoringPluginScript" which
+# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
+# of the role of each function is provided in context of the example; see the other
+# examples for more potential uses.
+
+# WARNING:
+#
+# This script will run in the context of the node daemon keepalives as root.
+# DO NOT install untrusted, unvetted plugins under any circumstances.
+
+
+# This import is always required here, as MonitoringPlugin is used by the
+# MonitoringPluginScript class
+from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
+
+
+# A monitoring plugin script must always expose its nice name, which must be identical to
+# the file name
+PLUGIN_NAME = "nics"
+
+
+# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
+class MonitoringPluginScript(MonitoringPlugin):
+    def setup(self):
+        """
+        setup(): Perform special setup steps during node daemon startup
+
+        This step is optional and should be used sparingly.
+
+        If you wish for the plugin to not load in certain conditions, do any checks here
+        and return a non-None failure message to indicate the error.
+        """
+
+        pass
+
+    def run(self, coordinator_state=None):
+        """
+        run(): Perform the check actions and return a PluginResult object
+
+        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
+        """
+
+        # Run any imports first
+        import daemon_lib.common as common
+        from re import match, search, findall
+
+        messages = list()
+        health_delta = 0
+
+        # Get a list of the various underlying devices
+        _core_nics = set()
+
+        for dev in [
+                self.config['bridge_dev'],
+                self.config['upstream_dev'],
+                self.config['cluster_dev'],
+                self.config['storage_dev'],
+        ]:
+            with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent:
+                _devtype = uevent.readlines()[0].split('=')[-1].strip()
+
+            if _devtype == 'vlan':
+                with open(f"/proc/net/vlan/{dev}") as devfh:
+                    vlan_info = devfh.read().split('\n')
+                for line in vlan_info:
+                    if match(r'^Device:', line):
+                        dev = line.split()[-1]
+
+            _core_nics.add(dev)
+
+        core_nics = sorted(list(_core_nics))
+
+        for dev in core_nics:
+            with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent:
+                _devtype = uevent.readlines()[0].split('=')[-1].strip()
+
+            if _devtype == "bond":
+                syspath = f"/proc/net/bonding/{dev}"
+
+                with open(syspath) as devfh:
+                    bonding_stats = devfh.read()
+
+                _, _mode, _info, *_slaves = bonding_stats.split('\n\n')
+
+                slave_interfaces = list()
+                for slavedev in _slaves:
+                    lines = slavedev.split('\n')
+                    for line in lines:
+                        if match(r'^Slave Interface:', line):
+                            interface_name = line.split()[-1]
+                        if match(r'^MII Status:', line):
+                            interface_status = line.split()[-1]
+                        if match(r'^Speed:', line):
+                            try:
+                                interface_speed_mbps = int(line.split()[-2])
+                            except Exception:
+                                interface_speed_mbps = 0
+                        if match(r'^Duplex:', line):
+                            interface_duplex = line.split()[-1]
+                    slave_interfaces.append((interface_name, interface_status, interface_speed_mbps, interface_duplex))
+
+                # Ensure at least 2 slave interfaces are up
+                slave_interface_up_count = 0
+                for slave_interface in slave_interfaces:
+                    if slave_interface[1] == 'up':
+                        slave_interface_up_count += 1
+                if slave_interface_up_count < len(slave_interfaces):
+                    messages.append(f"{dev} DEGRADED with {slave_interface_up_count} active slaves")
+                    health_delta += 10
+                else:
+                    messages.append(f"{dev} OK with {slave_interface_up_count} active slaves")
+
+                # Get ethtool supported speeds for slave interfaces
+                supported_link_speeds = set()
+                for slave_interface in slave_interfaces:
+                    slave_dev = slave_interface[0]
+                    _, ethtool_stdout, _ = common.run_os_command(f"ethtool {slave_dev}")
+                    in_modes = False
+                    for line in ethtool_stdout.split('\n'):
+                        if search('Supported link modes:', line):
+                            in_modes = True
+                        if search('Supported pause frame use:', line):
+                            in_modes = False
+                            break
+                        if in_modes:
+                            speed = int(findall(r'\d+', line.split()[-1])[0])
+                            supported_link_speeds.add(speed)
+            else:
+                # Get ethtool supported speeds for interface
+                supported_link_speeds = set()
+                _, ethtool_stdout, _ = common.run_os_command(f"ethtool {dev}")
+                in_modes = False
+                for line in ethtool_stdout.split('\n'):
+                    if search('Supported link modes:', line):
+                        in_modes = True
+                    if search('Supported pause frame use:', line):
+                        in_modes = False
+                        break
+                    if in_modes:
+                        speed = int(line.split()[-1].replace('baseT', '').split('/')[0])
+                        supported_link_speeds.add(speed)
+
+            max_supported_link_speed = sorted(list(supported_link_speeds))[-1]
+
+            # Ensure interface is running at its maximum speed
+            with open(f"/sys/class/net/{dev}/speed") as devfh:
+                dev_speed = int(devfh.read())
+            if dev_speed < max_supported_link_speed:
+                messages.append(f"{dev} DEGRADED at {dev_speed} Mbps")
+                health_delta += 10
+            else:
+                messages.append(f"{dev} OK at {dev_speed} Mbps")
+
+        # Set the health delta in our local PluginResult object
+        self.plugin_result.set_health_delta(health_delta)
+
+        # Set the message in our local PluginResult object
+        self.plugin_result.set_message(', '.join(messages))
+
+        # Return our local PluginResult object
+        return self.plugin_result
+
+    def cleanup(self):
+        """
+        cleanup(): Perform special cleanup steps during node daemon termination
+
+        This step is optional and should be used sparingly.
+        """
+
+        pass
--- a/health-daemon/plugins/psql
+++ b/health-daemon/plugins/psql
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+
+# psql.py - PVC Monitoring example plugin for Postgres/Patroni
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+# This script provides an example of a PVC monitoring plugin script. It will create
+# a simple plugin to check the Patroni PostgreSQL instance on the node for operation.
+
+# This script can thus be used as an example or reference implementation of a
+# PVC monitoring pluginscript and expanded upon as required.
+
+# A monitoring plugin script must implement the class "MonitoringPluginScript" which
+# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
+# of the role of each function is provided in context of the example; see the other
+# examples for more potential uses.
+
+# WARNING:
+#
+# This script will run in the context of the node daemon keepalives as root.
+# DO NOT install untrusted, unvetted plugins under any circumstances.
+
+
+# This import is always required here, as MonitoringPlugin is used by the
+# MonitoringPluginScript class
+from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
+
+
+# A monitoring plugin script must always expose its nice name, which must be identical to
+# the file name
+PLUGIN_NAME = "psql"
+
+
+# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
+class MonitoringPluginScript(MonitoringPlugin):
+    def setup(self):
+        """
+        setup(): Perform special setup steps during node daemon startup
+
+        This step is optional and should be used sparingly.
+        """
+
+        pass
+
+    def run(self, coordinator_state=None):
+        """
+        run(): Perform the check actions and return a PluginResult object
+
+        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
+        """
+
+        # Run any imports first
+        from psycopg2 import connect
+
+        conn_metadata = None
+        cur_metadata = None
+        conn_dns = None
+        cur_dns = None
+
+        # Set the health delta to 0 (no change)
+        health_delta = 0
+        # Craft a message that can be used by the clients
+        message = "Successfully connected to PostgreSQL databases on localhost"
+
+        # Check the Metadata database (primary)
+        try:
+            conn_metadata = connect(
+                host=self.this_node.name,
+                port=self.config["metadata_postgresql_port"],
+                dbname=self.config["metadata_postgresql_dbname"],
+                user=self.config["metadata_postgresql_user"],
+                password=self.config["metadata_postgresql_password"],
+            )
+            cur_metadata = conn_metadata.cursor()
+            cur_metadata.execute("""SELECT * FROM alembic_version""")
+            data = cur_metadata.fetchone()
+        except Exception as e:
+            health_delta = 50
+            err = str(e).split('\n')[0]
+            message = f"Failed to connect to PostgreSQL database {self.config['metadata_postgresql_dbname']}: {err}"
+        finally:
+            if cur_metadata is not None:
+                cur_metadata.close()
+            if conn_metadata is not None:
+                conn_metadata.close()
+
+        if health_delta == 0:
+            # Check the PowerDNS database (secondary)
+            try:
+                conn_pdns = connect(
+                    host=self.this_node.name,
+                    port=self.config["pdns_postgresql_port"],
+                    dbname=self.config["pdns_postgresql_dbname"],
+                    user=self.config["pdns_postgresql_user"],
+                    password=self.config["pdns_postgresql_password"],
+                )
+                cur_pdns = conn_pdns.cursor()
+                cur_pdns.execute("""SELECT * FROM supermasters""")
+                data = cur_pdns.fetchone()
+            except Exception as e:
+                health_delta = 50
+                err = str(e).split('\n')[0]
+                message = f"Failed to connect to PostgreSQL database {self.config['pdns_postgresql_dbname']}: {err}"
+            finally:
+                if cur_pdns is not None:
+                    cur_pdns.close()
+                if conn_pdns is not None:
+                    conn_pdns.close()
+
+        # Set the health delta in our local PluginResult object
+        self.plugin_result.set_health_delta(health_delta)
+
+        # Set the message in our local PluginResult object
+        self.plugin_result.set_message(message)
+
+        # Return our local PluginResult object
+        return self.plugin_result
+
+    def cleanup(self):
+        """
+        cleanup(): Perform special cleanup steps during node daemon termination
+
+        This step is optional and should be used sparingly.
+        """
+
+        pass
--- a/health-daemon/plugins/psur
+++ b/health-daemon/plugins/psur
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+
+# psur.py - PVC Monitoring example plugin for PSU Redundancy
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+# This script provides an example of a PVC monitoring plugin script. It will create
+# a simple plugin to check IPMI for power supply reundancy status.
+
+# This script can thus be used as an example or reference implementation of a
+# PVC monitoring pluginscript and expanded upon as required.
+
+# A monitoring plugin script must implement the class "MonitoringPluginScript" which
+# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
+# of the role of each function is provided in context of the example; see the other
+# examples for more potential uses.
+
+# WARNING:
+#
+# This script will run in the context of the node daemon keepalives as root.
+# DO NOT install untrusted, unvetted plugins under any circumstances.
+
+
+# This import is always required here, as MonitoringPlugin is used by the
+# MonitoringPluginScript class
+from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
+
+
+# A monitoring plugin script must always expose its nice name, which must be identical to
+# the file name
+PLUGIN_NAME = "psur"
+
+
+# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
+class MonitoringPluginScript(MonitoringPlugin):
+    def setup(self):
+        """
+        setup(): Perform special setup steps during node daemon startup
+
+        This step is optional and should be used sparingly.
+
+        If you wish for the plugin to not load in certain conditions, do any checks here
+        and return a non-None failure message to indicate the error.
+        """
+
+        # Run any imports first
+        from daemon_lib.common import run_os_command
+        from re import match
+
+        _ipmitool_ret, _ipmitool_list, _ = run_os_command("ipmitool sdr type 'Power Supply'")
+        if _ipmitool_ret != 0:
+            return "Error running ipmitool command"
+        else:
+            search_values = [
+                "PS Redundancy",  # Dell PowerEdge
+                "Power Supplies", # HP ProLiant
+                "PS_RDNDNT_MODE", # Cisco UCS
+            ]
+            reading_lines = [l for l in _ipmitool_list.split('\n') if len(l.split('|')) > 0 and l.split('|')[0].strip() in search_values]
+            if len(reading_lines) < 1:
+                return "No valid input power sensors found"
+
+    def run(self, coordinator_state=None):
+        """
+        run(): Perform the check actions and return a PluginResult object
+
+        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
+        """
+
+        # Run any imports first
+        from daemon_lib.common import run_os_command
+        from re import match
+
+        health_delta = 0
+        messages = list()
+
+        _ipmitool_ret, _ipmitool_list, _ = run_os_command("ipmitool sdr type 'Power Supply'")
+        if _ipmitool_ret != 0 or len(_ipmitool_list.split('\n')) < 1:
+            health_delta = 0
+            messages.append("Error running ipmitool command")
+        else:
+            search_values = [
+                "PS Redundancy",  # Dell PowerEdge
+                "Power Supplies", # HP ProLiant
+                "PS_RDNDNT_MODE", # Cisco UCS
+            ]
+
+            reading_lines = [l for l in _ipmitool_list.split('\n') if len(l.split('|')) > 0 and l.split('|')[0].strip() in search_values]
+            if len(reading_lines) > 0:
+                for reading_line in reading_lines:
+                    reading_sensor = reading_line.split('|')[1].strip()
+                    reading_text = reading_line.split('|')[-1].strip()
+
+                    if reading_text == "Fully Redundant":
+                        health_delta += 0
+                        messages.append(f"Input power sensor {reading_sensor} reports {reading_text}")
+                    elif reading_text == "No Reading":
+                        health_delta += 5
+                        messages.append(f"Input power sensor {reading_sensor} reports {reading_text} (PSU redundancy not configured?)")
+                    else:
+                        health_delta += 10
+                        messages.append(f"Input power sensor {reading_sensor} reports {reading_text}")
+            else:
+                health_delta = 5
+                messages.append("No valid input power sensors found, but configured")
+
+        # Set the health delta in our local PluginResult object
+        self.plugin_result.set_health_delta(health_delta)
+
+        # Set the message in our local PluginResult object
+        self.plugin_result.set_message(', '.join(messages))
+
+        # Return our local PluginResult object
+        return self.plugin_result
+
+    def cleanup(self):
+        """
+        cleanup(): Perform special cleanup steps during node daemon termination
+
+        This step is optional and should be used sparingly.
+        """
+
+        pass
--- a/health-daemon/plugins/zkpr
+++ b/health-daemon/plugins/zkpr
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+
+# zkpr.py - PVC Monitoring example plugin for Zookeeper
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+# This script provides an example of a PVC monitoring plugin script. It will create
+# a simple plugin to check the Zookeeper instance on the node for operation.
+
+# This script can thus be used as an example or reference implementation of a
+# PVC monitoring pluginscript and expanded upon as required.
+
+# A monitoring plugin script must implement the class "MonitoringPluginScript" which
+# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
+# of the role of each function is provided in context of the example; see the other
+# examples for more potential uses.
+
+# WARNING:
+#
+# This script will run in the context of the node daemon keepalives as root.
+# DO NOT install untrusted, unvetted plugins under any circumstances.
+
+
+# This import is always required here, as MonitoringPlugin is used by the
+# MonitoringPluginScript class
+from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
+
+
+# A monitoring plugin script must always expose its nice name, which must be identical to
+# the file name
+PLUGIN_NAME = "zkpr"
+
+
+# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
+class MonitoringPluginScript(MonitoringPlugin):
+    def setup(self):
+        """
+        setup(): Perform special setup steps during node daemon startup
+
+        This step is optional and should be used sparingly.
+
+        If you wish for the plugin to not zkpr in certain conditions, do any checks here
+        and return a non-None failure message to indicate the error.
+        """
+
+        pass
+
+    def run(self, coordinator_state=None):
+        """
+        run(): Perform the check actions and return a PluginResult object
+
+        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
+        """
+
+        # Run any imports first
+        from kazoo.client import KazooClient, KazooState
+
+        zk_conn = None
+
+        # Set the health delta to 0 (no change)
+        health_delta = 0
+        # Craft a message that can be used by the clients
+        message = "Successfully connected to Zookeeper on localhost"
+
+        # Check the Zookeeper connection
+        try:
+            zk_conn = KazooClient(hosts=[f"{self.this_node.name}:2181"], timeout=1, read_only=True)
+            zk_conn.start(timeout=1)
+            data = zk_conn.get('/schema/version')
+        except Exception as e:
+            health_delta = 50
+            message = f"Failed to connect to Zookeeper: {e}"
+        finally:
+            if zk_conn is not None:
+                zk_conn.stop()
+                zk_conn.close()
+
+        # Set the health delta in our local PluginResult object
+        self.plugin_result.set_health_delta(health_delta)
+
+        # Set the message in our local PluginResult object
+        self.plugin_result.set_message(message)
+
+        # Return our local PluginResult object
+        return self.plugin_result
+
+    def cleanup(self):
+        """
+        cleanup(): Perform special cleanup steps during node daemon termination
+
+        This step is optional and should be used sparingly.
+        """
+
+        pass
--- a/health-daemon/pvchealthd.py
+++ b/health-daemon/pvchealthd.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+
+# pvchealthd.py - Health daemon startup stub
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+import pvchealthd.Daemon  # noqa: F401
+
+pvchealthd.Daemon.entrypoint()
--- a/health-daemon/pvchealthd.service
+++ b/health-daemon/pvchealthd.service
@@ -0,0 +1,20 @@
+# Parallel Virtual Cluster health daemon unit file
+
+[Unit]
+Description = Parallel Virtual Cluster health daemon
+After = network.target
+Wants = network-online.target
+PartOf = pvc.target
+
+[Service]
+Type = simple
+WorkingDirectory = /usr/share/pvc
+Environment = PYTHONUNBUFFERED=true
+Environment = PVC_CONFIG_FILE=/etc/pvc/pvc.conf
+ExecStartPre = /bin/sleep 2
+ExecStart = /usr/share/pvc/pvchealthd.py
+ExecStopPost = /bin/sleep 2
+Restart = on-failure
+
+[Install]
+WantedBy = pvc.target
--- a/health-daemon/pvchealthd/Daemon.py
+++ b/health-daemon/pvchealthd/Daemon.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+
+# Daemon.py - Health daemon main entrypoing
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+import pvchealthd.util.config
+import pvchealthd.util.zookeeper
+
+import pvchealthd.objects.MonitoringInstance as MonitoringInstance
+import pvchealthd.objects.NodeInstance as NodeInstance
+
+import daemon_lib.log as log
+
+from time import sleep
+
+import os
+import signal
+
+# Daemon version
+version = "0.9.82"
+
+
+##########################################################
+# Entrypoint
+##########################################################
+
+
+def entrypoint():
+    monitoring_instance = None
+
+    # Get our configuration
+    config = pvchealthd.util.config.get_configuration()
+    config["daemon_name"] = "pvchealthd"
+    config["daemon_version"] = version
+
+    # Set up the logger instance
+    logger = log.Logger(config)
+
+    # Print our startup message
+    logger.out("")
+    logger.out("|------------------------------------------------------------|")
+    logger.out("|                                                            |")
+    logger.out("|            ███████████ ▜█▙      ▟█▛ █████ █ █ █            |")
+    logger.out("|                     ██  ▜█▙    ▟█▛  ██                     |")
+    logger.out("|            ███████████   ▜█▙  ▟█▛   ██                     |")
+    logger.out("|            ██             ▜█▙▟█▛    ███████████            |")
+    logger.out("|                                                            |")
+    logger.out("|------------------------------------------------------------|")
+    logger.out("| Parallel Virtual Cluster health daemon v{0: <18} |".format(version))
+    logger.out("| Debug: {0: <51} |".format(str(config["debug"])))
+    logger.out("| FQDN: {0: <52} |".format(config["node_fqdn"]))
+    logger.out("| Host: {0: <52} |".format(config["node_hostname"]))
+    logger.out("| ID: {0: <54} |".format(config["node_id"]))
+    logger.out("| IPMI hostname: {0: <43} |".format(config["ipmi_hostname"]))
+    logger.out("| Machine details:                                           |")
+    logger.out("|   CPUs: {0: <50} |".format(config["static_data"][0]))
+    logger.out("|   Arch: {0: <50} |".format(config["static_data"][3]))
+    logger.out("|   OS: {0: <52} |".format(config["static_data"][2]))
+    logger.out("|   Kernel: {0: <48} |".format(config["static_data"][1]))
+    logger.out("|------------------------------------------------------------|")
+    logger.out("")
+    logger.out(f'Starting pvchealthd on host {config["node_fqdn"]}', state="s")
+
+    # Connect to Zookeeper and return our handler and current schema version
+    zkhandler, _ = pvchealthd.util.zookeeper.connect(logger, config)
+
+    # Define a cleanup function
+    def cleanup(failure=False):
+        nonlocal logger, zkhandler, monitoring_instance
+
+        logger.out("Terminating pvchealthd and cleaning up", state="s")
+
+        # Shut down the monitoring system
+        try:
+            logger.out("Shutting down monitoring subsystem", state="s")
+            monitoring_instance.shutdown()
+        except Exception:
+            pass
+
+        # Close the Zookeeper connection
+        try:
+            zkhandler.disconnect(persistent=True)
+            del zkhandler
+        except Exception:
+            pass
+
+        logger.out("Terminated health daemon", state="s")
+        logger.terminate()
+
+        if failure:
+            retcode = 1
+        else:
+            retcode = 0
+
+        os._exit(retcode)
+
+    # Termination function
+    def term(signum="", frame=""):
+        cleanup(failure=False)
+
+    # Hangup (logrotate) function
+    def hup(signum="", frame=""):
+        if config["file_logging"]:
+            logger.hup()
+
+    # Handle signals gracefully
+    signal.signal(signal.SIGTERM, term)
+    signal.signal(signal.SIGINT, term)
+    signal.signal(signal.SIGQUIT, term)
+    signal.signal(signal.SIGHUP, hup)
+
+    this_node = NodeInstance.NodeInstance(
+        config["node_hostname"],
+        zkhandler,
+        config,
+        logger,
+    )
+
+    # Set up the node monitoring instance and thread
+    monitoring_instance = MonitoringInstance.MonitoringInstance(
+        zkhandler, config, logger, this_node
+    )
+
+    # Tick loop; does nothing since everything is async
+    while True:
+        try:
+            sleep(1)
+        except Exception:
+            break
--- a/health-daemon/pvchealthd/init.py
+++ b/health-daemon/pvchealthd/init.py
--- a/health-daemon/pvchealthd/objects/MonitoringInstance.py
+++ b/health-daemon/pvchealthd/objects/MonitoringInstance.py
@@ -0,0 +1,484 @@
+#!/usr/bin/env python3
+
+# MonitoringInstance.py - Class implementing a PVC monitor in pvchealthd
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+import concurrent.futures
+import time
+import importlib.util
+
+from os import walk
+from datetime import datetime
+from json import dumps
+from apscheduler.schedulers.background import BackgroundScheduler
+
+
+class PluginError(Exception):
+    """
+    An exception that results from a plugin failing setup
+    """
+
+    pass
+
+
+class PluginResult(object):
+    def __init__(self, zkhandler, config, logger, this_node, plugin_name):
+        self.zkhandler = zkhandler
+        self.config = config
+        self.logger = logger
+        self.this_node = this_node
+        self.plugin_name = plugin_name
+        self.current_time = int(time.time())
+        self.health_delta = 0
+        self.message = "N/A"
+        self.data = {}
+        self.runtime = "0.00"
+
+    def set_health_delta(self, new_delta):
+        self.health_delta = new_delta
+
+    def set_message(self, new_message):
+        self.message = new_message
+
+    def set_data(self, new_data):
+        self.data = new_data
+
+    def set_runtime(self, new_runtime):
+        self.runtime = new_runtime
+
+    def to_zookeeper(self):
+        self.zkhandler.write(
+            [
+                (
+                    (
+                        "node.monitoring.data",
+                        self.this_node.name,
+                        "monitoring_plugin.name",
+                        self.plugin_name,
+                    ),
+                    self.plugin_name,
+                ),
+                (
+                    (
+                        "node.monitoring.data",
+                        self.this_node.name,
+                        "monitoring_plugin.last_run",
+                        self.plugin_name,
+                    ),
+                    self.current_time,
+                ),
+                (
+                    (
+                        "node.monitoring.data",
+                        self.this_node.name,
+                        "monitoring_plugin.health_delta",
+                        self.plugin_name,
+                    ),
+                    self.health_delta,
+                ),
+                (
+                    (
+                        "node.monitoring.data",
+                        self.this_node.name,
+                        "monitoring_plugin.message",
+                        self.plugin_name,
+                    ),
+                    self.message,
+                ),
+                (
+                    (
+                        "node.monitoring.data",
+                        self.this_node.name,
+                        "monitoring_plugin.data",
+                        self.plugin_name,
+                    ),
+                    dumps(self.data),
+                ),
+                (
+                    (
+                        "node.monitoring.data",
+                        self.this_node.name,
+                        "monitoring_plugin.runtime",
+                        self.plugin_name,
+                    ),
+                    self.runtime,
+                ),
+            ]
+        )
+
+
+class MonitoringPlugin(object):
+    def __init__(self, zkhandler, config, logger, this_node, plugin_name):
+        self.zkhandler = zkhandler
+        self.config = config
+        self.logger = logger
+        self.this_node = this_node
+        self.plugin_name = plugin_name
+
+        self.plugin_result = PluginResult(
+            self.zkhandler,
+            self.config,
+            self.logger,
+            self.this_node,
+            self.plugin_name,
+        )
+
+    def __str__(self):
+        return self.plugin_name
+
+    #
+    # Helper functions; exposed to child MonitoringPluginScript instances
+    #
+    def log(self, message, state="d"):
+        """
+        Log a message to the PVC logger instance using the plugin name as a prefix
+        Takes "state" values as defined by the PVC logger instance, defaulting to debug:
+            "d": debug
+            "i": informational
+            "t": tick/keepalive
+            "w": warning
+            "e": error
+        """
+        if state == "d" and not self.config["debug"]:
+            return
+
+        self.logger.out(message, state=state, prefix=self.plugin_name)
+
+    #
+    # Primary class functions; implemented by the individual plugins
+    #
+    def setup(self):
+        """
+        setup(): Perform setup of the plugin; run once during daemon startup
+
+        This step is optional and should be used sparingly.
+
+        If you wish for the plugin to not load in certain conditions, do any checks here
+        and return a non-None failure message to indicate the error.
+        """
+        pass
+
+    def run(self, coordinator_state=None):
+        """
+        run(): Run the plugin, returning a PluginResult object
+
+        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
+        """
+        return self.plugin_result
+
+    def cleanup(self):
+        """
+        cleanup(): Clean up after the plugin; run once during daemon shutdown
+        OPTIONAL
+        """
+        pass
+
+
+class MonitoringInstance(object):
+    def __init__(self, zkhandler, config, logger, this_node):
+        self.zkhandler = zkhandler
+        self.config = config
+        self.logger = logger
+        self.this_node = this_node
+
+        # Get a list of plugins from the plugin_directory
+        plugin_files = next(walk(self.config["plugin_directory"]), (None, None, []))[
+            2
+        ]  # [] if no file
+
+        self.all_plugins = list()
+        self.all_plugin_names = list()
+
+        successful_plugins = 0
+
+        # Load each plugin file into the all_plugins list
+        for plugin_file in sorted(plugin_files):
+            try:
+                self.logger.out(
+                    f"Loading monitoring plugin from {self.config['plugin_directory']}/{plugin_file}",
+                    state="i",
+                )
+                loader = importlib.machinery.SourceFileLoader(
+                    "plugin_script", f"{self.config['plugin_directory']}/{plugin_file}"
+                )
+                spec = importlib.util.spec_from_loader(loader.name, loader)
+                plugin_script = importlib.util.module_from_spec(spec)
+                spec.loader.exec_module(plugin_script)
+
+                plugin = plugin_script.MonitoringPluginScript(
+                    self.zkhandler,
+                    self.config,
+                    self.logger,
+                    self.this_node,
+                    plugin_script.PLUGIN_NAME,
+                )
+
+                failed_setup = plugin.setup()
+                if failed_setup is not None:
+                    raise PluginError(f"{failed_setup}")
+
+                # Create plugin key
+                self.zkhandler.write(
+                    [
+                        (
+                            (
+                                "node.monitoring.data",
+                                self.this_node.name,
+                                "monitoring_plugin.name",
+                                plugin.plugin_name,
+                            ),
+                            plugin.plugin_name,
+                        ),
+                        (
+                            (
+                                "node.monitoring.data",
+                                self.this_node.name,
+                                "monitoring_plugin.last_run",
+                                plugin.plugin_name,
+                            ),
+                            "0",
+                        ),
+                        (
+                            (
+                                "node.monitoring.data",
+                                self.this_node.name,
+                                "monitoring_plugin.health_delta",
+                                plugin.plugin_name,
+                            ),
+                            "0",
+                        ),
+                        (
+                            (
+                                "node.monitoring.data",
+                                self.this_node.name,
+                                "monitoring_plugin.message",
+                                plugin.plugin_name,
+                            ),
+                            "Initializing",
+                        ),
+                        (
+                            (
+                                "node.monitoring.data",
+                                self.this_node.name,
+                                "monitoring_plugin.data",
+                                plugin.plugin_name,
+                            ),
+                            dumps({}),
+                        ),
+                        (
+                            (
+                                "node.monitoring.data",
+                                self.this_node.name,
+                                "monitoring_plugin.runtime",
+                                plugin.plugin_name,
+                            ),
+                            "0.00",
+                        ),
+                    ]
+                )
+
+                self.all_plugins.append(plugin)
+                self.all_plugin_names.append(plugin.plugin_name)
+                successful_plugins += 1
+
+                self.logger.out(
+                    f"Successfully loaded monitoring plugin '{plugin.plugin_name}'",
+                    state="o",
+                )
+            except Exception as e:
+                self.logger.out(
+                    f"Failed to load monitoring plugin: {e}",
+                    state="w",
+                )
+
+        self.zkhandler.write(
+            [
+                (
+                    ("node.monitoring.plugins", self.this_node.name),
+                    " ".join(self.all_plugin_names),
+                ),
+            ]
+        )
+
+        if successful_plugins < 1:
+            return
+
+        # Clean up any old plugin data for which a plugin file no longer exists
+        plugins_data = self.zkhandler.children(
+            ("node.monitoring.data", self.this_node.name)
+        )
+        if plugins_data is not None:
+            for plugin_key in plugins_data:
+                if plugin_key not in self.all_plugin_names:
+                    self.zkhandler.delete(
+                        (
+                            "node.monitoring.data",
+                            self.this_node.name,
+                            "monitoring_plugin",
+                            plugin_key,
+                        )
+                    )
+
+        self.run_plugins()
+        self.start_check_timer()
+
+    def __del__(self):
+        self.shutdown()
+
+    def shutdown(self):
+        self.stop_check_timer()
+        self.run_cleanups()
+
+    def start_check_timer(self):
+        check_interval = self.config["monitoring_interval"]
+        self.logger.out(
+            f"Starting monitoring check timer ({check_interval} second interval)",
+            state="s",
+        )
+        self.check_timer = BackgroundScheduler()
+        self.check_timer.add_job(
+            self.run_plugins,
+            trigger="interval",
+            seconds=check_interval,
+        )
+        self.check_timer.start()
+
+    def stop_check_timer(self):
+        try:
+            self.check_timer.shutdown()
+            self.logger.out("Stopping monitoring check timer", state="s")
+        except Exception:
+            self.logger.out("Failed to stop monitoring check timer", state="w")
+
+    def run_plugin(self, plugin):
+        time_start = datetime.now()
+        try:
+            result = plugin.run(coordinator_state=self.this_node.coordinator_state)
+        except Exception as e:
+            self.logger.out(
+                f"Monitoring plugin {plugin.plugin_name} failed: {type(e).__name__}: {e}",
+                state="e",
+            )
+            # Whatever it had, we try to return
+            return plugin.plugin_result
+        time_end = datetime.now()
+        time_delta = time_end - time_start
+        runtime = "{:0.02f}".format(time_delta.total_seconds())
+        result.set_runtime(runtime)
+        result.to_zookeeper()
+        return result
+
+    def run_plugins(self):
+        if self.this_node.coordinator_state == "primary":
+            cst_colour = self.logger.fmt_green
+        elif self.this_node.coordinator_state == "secondary":
+            cst_colour = self.logger.fmt_blue
+        else:
+            cst_colour = self.logger.fmt_cyan
+
+        active_coordinator_state = self.this_node.coordinator_state
+
+        runtime_start = datetime.now()
+        total_health = 100
+        plugin_results = list()
+        with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor:
+            to_future_plugin_results = {
+                executor.submit(self.run_plugin, plugin): plugin
+                for plugin in self.all_plugins
+            }
+            for future in concurrent.futures.as_completed(to_future_plugin_results):
+                plugin_results.append(future.result())
+
+        for result in sorted(plugin_results, key=lambda x: x.plugin_name):
+            if (
+                self.config["log_keepalives"]
+                and self.config["log_keepalive_plugin_details"]
+            ):
+                self.logger.out(
+                    result.message + f" [-{result.health_delta}]",
+                    state="t",
+                    prefix=f"{result.plugin_name} ({result.runtime}s)",
+                )
+            total_health -= result.health_delta
+
+        if total_health < 0:
+            total_health = 0
+
+        self.zkhandler.write(
+            [
+                (
+                    ("node.monitoring.health", self.this_node.name),
+                    total_health,
+                ),
+            ]
+        )
+
+        runtime_end = datetime.now()
+        runtime_delta = runtime_end - runtime_start
+        runtime = "{:0.02f}".format(runtime_delta.total_seconds())
+        time.sleep(0.2)
+
+        if isinstance(self.this_node.health, int):
+            if self.this_node.health > 90:
+                health_colour = self.logger.fmt_green
+            elif self.this_node.health > 50:
+                health_colour = self.logger.fmt_yellow
+            else:
+                health_colour = self.logger.fmt_red
+            health_text = str(self.this_node.health) + "%"
+        else:
+            health_colour = self.logger.fmt_blue
+            health_text = "N/A"
+
+        self.logger.out(
+            "{start_colour}{hostname} healthcheck @ {starttime}{nofmt} [{cst_colour}{costate}{nofmt}] result is {health_colour}{health}{nofmt} in {runtime} seconds".format(
+                start_colour=self.logger.fmt_purple,
+                cst_colour=self.logger.fmt_bold + cst_colour,
+                health_colour=health_colour,
+                nofmt=self.logger.fmt_end,
+                hostname=self.config["node_hostname"],
+                starttime=runtime_start,
+                costate=active_coordinator_state,
+                health=health_text,
+                runtime=runtime,
+            ),
+            state="t",
+        )
+
+    def run_cleanup(self, plugin):
+        return plugin.cleanup()
+
+    def run_cleanups(self):
+        with concurrent.futures.ThreadPoolExecutor(max_workers=99) as executor:
+            to_future_plugin_results = {
+                executor.submit(self.run_cleanup, plugin): plugin
+                for plugin in self.all_plugins
+            }
+            for future in concurrent.futures.as_completed(to_future_plugin_results):
+                # This doesn't do anything, just lets us wait for them all to complete
+                pass
+        # Set the node health to None as no previous checks are now valid
+        self.zkhandler.write(
+            [
+                (
+                    ("node.monitoring.health", self.this_node.name),
+                    None,
+                ),
+            ]
+        )
--- a/health-daemon/pvchealthd/objects/NodeInstance.py
+++ b/health-daemon/pvchealthd/objects/NodeInstance.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python3
+
+# NodeInstance.py - Class implementing a PVC node in pvchealthd
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+
+class NodeInstance(object):
+    # Initialization function
+    def __init__(
+        self,
+        name,
+        zkhandler,
+        config,
+        logger,
+    ):
+        # Passed-in variables on creation
+        self.name = name
+        self.zkhandler = zkhandler
+        self.config = config
+        self.logger = logger
+        # States
+        self.daemon_state = "stop"
+        self.coordinator_state = "client"
+        self.domain_state = "flushed"
+        # Node resources
+        self.health = 100
+        self.active_domains_count = 0
+        self.provisioned_domains_count = 0
+        self.memused = 0
+        self.memfree = 0
+        self.memalloc = 0
+        self.vcpualloc = 0
+
+        # Zookeeper handlers for changed states
+        @self.zkhandler.zk_conn.DataWatch(
+            self.zkhandler.schema.path("node.state.daemon", self.name)
+        )
+        def watch_node_daemonstate(data, stat, event=""):
+            if event and event.type == "DELETED":
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode("ascii")
+            except AttributeError:
+                data = "stop"
+
+            if data != self.daemon_state:
+                self.daemon_state = data
+
+        @self.zkhandler.zk_conn.DataWatch(
+            self.zkhandler.schema.path("node.state.router", self.name)
+        )
+        def watch_node_routerstate(data, stat, event=""):
+            if event and event.type == "DELETED":
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode("ascii")
+            except AttributeError:
+                data = "client"
+
+            if data != self.coordinator_state:
+                self.coordinator_state = data
+
+        @self.zkhandler.zk_conn.DataWatch(
+            self.zkhandler.schema.path("node.state.domain", self.name)
+        )
+        def watch_node_domainstate(data, stat, event=""):
+            if event and event.type == "DELETED":
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode("ascii")
+            except AttributeError:
+                data = "unknown"
+
+            if data != self.domain_state:
+                self.domain_state = data
+
+        @self.zkhandler.zk_conn.DataWatch(
+            self.zkhandler.schema.path("node.monitoring.health", self.name)
+        )
+        def watch_node_health(data, stat, event=""):
+            if event and event.type == "DELETED":
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode("ascii")
+            except AttributeError:
+                data = 100
+
+            try:
+                data = int(data)
+            except ValueError:
+                pass
+
+            if data != self.health:
+                self.health = data
+
+        @self.zkhandler.zk_conn.DataWatch(
+            self.zkhandler.schema.path("node.memory.free", self.name)
+        )
+        def watch_node_memfree(data, stat, event=""):
+            if event and event.type == "DELETED":
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode("ascii")
+            except AttributeError:
+                data = 0
+
+            if data != self.memfree:
+                self.memfree = data
+
+        @self.zkhandler.zk_conn.DataWatch(
+            self.zkhandler.schema.path("node.memory.used", self.name)
+        )
+        def watch_node_memused(data, stat, event=""):
+            if event and event.type == "DELETED":
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode("ascii")
+            except AttributeError:
+                data = 0
+
+            if data != self.memused:
+                self.memused = data
+
+        @self.zkhandler.zk_conn.DataWatch(
+            self.zkhandler.schema.path("node.memory.allocated", self.name)
+        )
+        def watch_node_memalloc(data, stat, event=""):
+            if event and event.type == "DELETED":
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode("ascii")
+            except AttributeError:
+                data = 0
+
+            if data != self.memalloc:
+                self.memalloc = data
+
+        @self.zkhandler.zk_conn.DataWatch(
+            self.zkhandler.schema.path("node.vcpu.allocated", self.name)
+        )
+        def watch_node_vcpualloc(data, stat, event=""):
+            if event and event.type == "DELETED":
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode("ascii")
+            except AttributeError:
+                data = 0
+
+            if data != self.vcpualloc:
+                self.vcpualloc = data
+
+        @self.zkhandler.zk_conn.DataWatch(
+            self.zkhandler.schema.path("node.running_domains", self.name)
+        )
+        def watch_node_runningdomains(data, stat, event=""):
+            if event and event.type == "DELETED":
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode("ascii").split()
+            except AttributeError:
+                data = []
+
+            if len(data) != self.active_domains_count:
+                self.active_domains_count = len(data)
+
+        @self.zkhandler.zk_conn.DataWatch(
+            self.zkhandler.schema.path("node.count.provisioned_domains", self.name)
+        )
+        def watch_node_domainscount(data, stat, event=""):
+            if event and event.type == "DELETED":
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode("ascii")
+            except AttributeError:
+                data = 0
+
+            if data != self.provisioned_domains_count:
+                self.provisioned_domains_count = data
--- a/health-daemon/pvchealthd/objects/init.py
+++ b/health-daemon/pvchealthd/objects/init.py
--- a/health-daemon/pvchealthd/util/init.py
+++ b/health-daemon/pvchealthd/util/init.py
--- a/health-daemon/pvchealthd/util/config.py
+++ b/health-daemon/pvchealthd/util/config.py
@@ -0,0 +1,696 @@
+#!/usr/bin/env python3
+
+# config.py - Utility functions for pvcnoded configuration parsing
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+import daemon_lib.common as common
+
+import os
+import subprocess
+import yaml
+
+from socket import gethostname
+from re import findall
+from psutil import cpu_count
+from ipaddress import ip_address, ip_network
+from json import loads
+
+
+class MalformedConfigurationError(Exception):
+    """
+    An except when parsing the PVC Node daemon configuration file
+    """
+
+    def __init__(self, error=None):
+        self.msg = f"ERROR: Configuration file is malformed: {error}"
+
+    def __str__(self):
+        return str(self.msg)
+
+
+def get_static_data():
+    """
+    Data that is obtained once at node startup for use later
+    """
+    staticdata = list()
+    staticdata.append(str(cpu_count()))  # CPU count
+    staticdata.append(
+        subprocess.run(["uname", "-r"], stdout=subprocess.PIPE)
+        .stdout.decode("ascii")
+        .strip()
+    )
+    staticdata.append(
+        subprocess.run(["uname", "-o"], stdout=subprocess.PIPE)
+        .stdout.decode("ascii")
+        .strip()
+    )
+    staticdata.append(
+        subprocess.run(["uname", "-m"], stdout=subprocess.PIPE)
+        .stdout.decode("ascii")
+        .strip()
+    )
+
+    return staticdata
+
+
+def get_configuration_path():
+    config_file = None
+    try:
+        _config_file = "/etc/pvc/pvcnoded.yaml"
+        if not os.path.exists(_config_file):
+            raise
+        config_file = _config_file
+        config_type = "legacy"
+    except Exception:
+        pass
+    try:
+        _config_file = os.environ["PVC_CONFIG_FILE"]
+        if not os.path.exists(_config_file):
+            raise
+        config_file = _config_file
+        config_type = "current"
+    except Exception:
+        pass
+
+    if not config_file:
+        print('ERROR: The "PVC_CONFIG_FILE" environment variable must be set.')
+        os._exit(1)
+
+    return config_file, config_type
+
+
+def get_hostname():
+    node_fqdn = gethostname()
+    node_hostname = node_fqdn.split(".", 1)[0]
+    node_domain = "".join(node_fqdn.split(".", 1)[1:])
+    try:
+        node_id = findall(r"\d+", node_hostname)[-1]
+    except IndexError:
+        node_id = 0
+
+    return node_fqdn, node_hostname, node_domain, node_id
+
+
+def validate_floating_ip(config, network):
+    if network not in ["cluster", "storage", "upstream"]:
+        return False, f'Specified network type "{network}" is not valid'
+
+    floating_key = f"{network}_floating_ip"
+    network_key = f"{network}_network"
+
+    # Verify the network provided is valid
+    try:
+        network = ip_network(config[network_key])
+    except Exception:
+        return (
+            False,
+            f"Network address {config[network_key]} for {network_key} is not valid",
+        )
+
+    # Verify that the floating IP is valid (and in the network)
+    try:
+        floating_address = ip_address(config[floating_key].split("/")[0])
+        if floating_address not in list(network.hosts()):
+            raise
+    except Exception:
+        return (
+            False,
+            f"Floating address {config[floating_key]} for {floating_key} is not valid",
+        )
+
+    return True, ""
+
+
+def get_configuration_current(config_file):
+    print('Loading configuration from file "{}"'.format(config_file))
+
+    with open(config_file, "r") as cfgfh:
+        try:
+            o_config = yaml.load(cfgfh, Loader=yaml.SafeLoader)
+        except Exception as e:
+            print(f"ERROR: Failed to parse configuration file: {e}")
+            os._exit(1)
+
+    config = dict()
+
+    node_fqdn, node_hostname, node_domain, node_id = get_hostname()
+
+    config_thisnode = {
+        "node": node_hostname,
+        "node_hostname": node_hostname,
+        "node_fqdn": node_fqdn,
+        "node_domain": node_domain,
+        "node_id": node_id,
+    }
+    config = {**config, **config_thisnode}
+
+    try:
+        o_path = o_config["path"]
+        config_path = {
+            "plugin_directory": o_path.get(
+                "plugin_directory", "/usr/share/pvc/plugins"
+            ),
+            "dynamic_directory": o_path["dynamic_directory"],
+            "log_directory": o_path["system_log_directory"],
+            "console_log_directory": o_path["console_log_directory"],
+            "ceph_directory": o_path["ceph_directory"],
+        }
+        # Define our dynamic directory schema
+        config_path["dnsmasq_dynamic_directory"] = (
+            config_path["dynamic_directory"] + "/dnsmasq"
+        )
+        config_path["pdns_dynamic_directory"] = (
+            config_path["dynamic_directory"] + "/pdns"
+        )
+        config_path["nft_dynamic_directory"] = config_path["dynamic_directory"] + "/nft"
+        # Define our log directory schema
+        config_path["dnsmasq_log_directory"] = config_path["log_directory"] + "/dnsmasq"
+        config_path["pdns_log_directory"] = config_path["log_directory"] + "/pdns"
+        config_path["nft_log_directory"] = config_path["log_directory"] + "/nft"
+        config = {**config, **config_path}
+
+        o_subsystem = o_config["subsystem"]
+        config_subsystem = {
+            "enable_hypervisor": o_subsystem.get("enable_hypervisor", True),
+            "enable_networking": o_subsystem.get("enable_networking", True),
+            "enable_storage": o_subsystem.get("enable_storage", True),
+            "enable_worker": o_subsystem.get("enable_worker", True),
+            "enable_api": o_subsystem.get("enable_api", True),
+        }
+        config = {**config, **config_subsystem}
+
+        o_cluster = o_config["cluster"]
+        config_cluster = {
+            "cluster_name": o_cluster["name"],
+            "all_nodes": o_cluster["all_nodes"],
+            "coordinators": o_cluster["coordinator_nodes"],
+        }
+        config = {**config, **config_cluster}
+
+        o_cluster_networks = o_cluster["networks"]
+        for network_type in ["cluster", "storage", "upstream"]:
+            o_cluster_networks_specific = o_cluster_networks[network_type]
+            config_cluster_networks_specific = {
+                f"{network_type}_domain": o_cluster_networks_specific["domain"],
+                f"{network_type}_dev": o_cluster_networks_specific["device"],
+                f"{network_type}_mtu": o_cluster_networks_specific["mtu"],
+                f"{network_type}_network": o_cluster_networks_specific["ipv4"][
+                    "network_address"
+                ]
+                + "/"
+                + str(o_cluster_networks_specific["ipv4"]["netmask"]),
+                f"{network_type}_floating_ip": o_cluster_networks_specific["ipv4"][
+                    "floating_address"
+                ]
+                + "/"
+                + str(o_cluster_networks_specific["ipv4"]["netmask"]),
+                f"{network_type}_node_ip_selection": o_cluster_networks_specific[
+                    "node_ip_selection"
+                ],
+            }
+
+            if (
+                o_cluster_networks_specific["ipv4"].get("gateway_address", None)
+                is not None
+            ):
+                config[f"{network_type}_gateway"] = o_cluster_networks_specific["ipv4"][
+                    "gateway_address"
+                ]
+
+            result, msg = validate_floating_ip(
+                config_cluster_networks_specific, network_type
+            )
+            if not result:
+                raise MalformedConfigurationError(msg)
+
+            network = ip_network(
+                config_cluster_networks_specific[f"{network_type}_network"]
+            )
+
+            if (
+                config_cluster_networks_specific[f"{network_type}_node_ip_selection"]
+                == "by-id"
+            ):
+                address_id = int(node_id) - 1
+            else:
+                # This roundabout solution ensures the given IP is in the subnet and is something valid
+                address_id = [
+                    idx
+                    for idx, ip in enumerate(list(network.hosts()))
+                    if str(ip)
+                    == config_cluster_networks_specific[
+                        f"{network_type}_node_ip_selection"
+                    ]
+                ][0]
+
+            config_cluster_networks_specific[
+                f"{network_type}_dev_ip"
+            ] = f"{list(network.hosts())[address_id]}/{network.prefixlen}"
+
+            config = {**config, **config_cluster_networks_specific}
+
+        o_database = o_config["database"]
+        config_database = {
+            "zookeeper_port": o_database["zookeeper"]["port"],
+            "keydb_port": o_database["keydb"]["port"],
+            "keydb_host": o_database["keydb"]["hostname"],
+            "keydb_path": o_database["keydb"]["path"],
+            "metadata_postgresql_port": o_database["postgres"]["port"],
+            "metadata_postgresql_host": o_database["postgres"]["hostname"],
+            "metadata_postgresql_dbname": o_database["postgres"]["credentials"]["api"][
+                "database"
+            ],
+            "metadata_postgresql_user": o_database["postgres"]["credentials"]["api"][
+                "username"
+            ],
+            "metadata_postgresql_password": o_database["postgres"]["credentials"][
+                "api"
+            ]["password"],
+            "pdns_postgresql_port": o_database["postgres"]["port"],
+            "pdns_postgresql_host": o_database["postgres"]["hostname"],
+            "pdns_postgresql_dbname": o_database["postgres"]["credentials"]["dns"][
+                "database"
+            ],
+            "pdns_postgresql_user": o_database["postgres"]["credentials"]["dns"][
+                "username"
+            ],
+            "pdns_postgresql_password": o_database["postgres"]["credentials"]["dns"][
+                "password"
+            ],
+        }
+        config = {**config, **config_database}
+
+        o_timer = o_config["timer"]
+        config_timer = {
+            "vm_shutdown_timeout": int(o_timer.get("vm_shutdown_timeout", 180)),
+            "keepalive_interval": int(o_timer.get("keepalive_interval", 5)),
+            "monitoring_interval": int(o_timer.get("monitoring_interval", 60)),
+        }
+        config = {**config, **config_timer}
+
+        o_fencing = o_config["fencing"]
+        config_fencing = {
+            "disable_on_ipmi_failure": o_fencing["disable_on_ipmi_failure"],
+            "fence_intervals": int(o_fencing["intervals"].get("fence_intervals", 6)),
+            "suicide_intervals": int(o_fencing["intervals"].get("suicide_interval", 0)),
+            "successful_fence": o_fencing["actions"].get("successful_fence", None),
+            "failed_fence": o_fencing["actions"].get("failed_fence", None),
+            "ipmi_hostname": o_fencing["ipmi"]["hostname"].format(node_id=node_id),
+            "ipmi_username": o_fencing["ipmi"]["username"],
+            "ipmi_password": o_fencing["ipmi"]["password"],
+        }
+        config = {**config, **config_fencing}
+
+        o_migration = o_config["migration"]
+        config_migration = {
+            "migration_target_selector": o_migration.get("target_selector", "mem"),
+        }
+        config = {**config, **config_migration}
+
+        o_logging = o_config["logging"]
+        config_logging = {
+            "debug": o_logging.get("debug_logging", False),
+            "file_logging": o_logging.get("file_logging", False),
+            "stdout_logging": o_logging.get("stdout_logging", False),
+            "zookeeper_logging": o_logging.get("zookeeper_logging", False),
+            "log_colours": o_logging.get("log_colours", False),
+            "log_dates": o_logging.get("log_dates", False),
+            "log_keepalives": o_logging.get("log_keepalives", False),
+            "log_keepalive_cluster_details": o_logging.get(
+                "log_cluster_details", False
+            ),
+            "log_keepalive_plugin_details": o_logging.get(
+                "log_monitoring_details", False
+            ),
+            "console_log_lines": o_logging.get("console_log_lines", False),
+            "node_log_lines": o_logging.get("node_log_lines", False),
+        }
+        config = {**config, **config_logging}
+
+        o_guest_networking = o_config["guest_networking"]
+        config_guest_networking = {
+            "bridge_dev": o_guest_networking["bridge_device"],
+            "bridge_mtu": o_guest_networking["bridge_mtu"],
+            "enable_sriov": o_guest_networking.get("sriov_enable", False),
+            "sriov_device": o_guest_networking.get("sriov_device", list()),
+        }
+        config = {**config, **config_guest_networking}
+
+        o_ceph = o_config["ceph"]
+        config_ceph = {
+            "ceph_config_file": config["ceph_directory"]
+            + "/"
+            + o_ceph["ceph_config_file"],
+            "ceph_admin_keyring": config["ceph_directory"]
+            + "/"
+            + o_ceph["ceph_keyring_file"],
+            "ceph_monitor_port": o_ceph["monitor_port"],
+            "ceph_secret_uuid": o_ceph["secret_uuid"],
+        }
+        config = {**config, **config_ceph}
+
+        # Add our node static data to the config
+        config["static_data"] = get_static_data()
+
+    except Exception as e:
+        raise MalformedConfigurationError(e)
+
+    return config
+
+
+def get_configuration_legacy(pvcnoded_config_file):
+    print('Loading configuration from file "{}"'.format(pvcnoded_config_file))
+
+    with open(pvcnoded_config_file, "r") as cfgfile:
+        try:
+            o_config = yaml.load(cfgfile, Loader=yaml.SafeLoader)
+        except Exception as e:
+            print("ERROR: Failed to parse configuration file: {}".format(e))
+            os._exit(1)
+
+    node_fqdn, node_hostname, node_domain, node_id = get_hostname()
+
+    # Create the configuration dictionary
+    config = dict()
+
+    # Get the initial base configuration
+    try:
+        o_base = o_config["pvc"]
+        o_cluster = o_config["pvc"]["cluster"]
+    except Exception as e:
+        raise MalformedConfigurationError(e)
+
+    config_general = {
+        "node": o_base.get("node", node_hostname),
+        "node_hostname": node_hostname,
+        "node_fqdn": node_fqdn,
+        "node_domain": node_domain,
+        "node_id": node_id,
+        "coordinators": o_cluster.get("coordinators", list()),
+        "debug": o_base.get("debug", False),
+    }
+
+    config = {**config, **config_general}
+
+    # Get the functions configuration
+    try:
+        o_functions = o_config["pvc"]["functions"]
+    except Exception as e:
+        raise MalformedConfigurationError(e)
+
+    config_functions = {
+        "enable_hypervisor": o_functions.get("enable_hypervisor", False),
+        "enable_networking": o_functions.get("enable_networking", False),
+        "enable_storage": o_functions.get("enable_storage", False),
+        "enable_worker": o_functions.get("enable_worker", True),
+        "enable_api": o_functions.get("enable_api", False),
+    }
+
+    config = {**config, **config_functions}
+
+    # Get the directory configuration
+    try:
+        o_directories = o_config["pvc"]["system"]["configuration"]["directories"]
+    except Exception as e:
+        raise MalformedConfigurationError(e)
+
+    config_directories = {
+        "plugin_directory": o_directories.get(
+            "plugin_directory", "/usr/share/pvc/plugins"
+        ),
+        "dynamic_directory": o_directories.get("dynamic_directory", None),
+        "log_directory": o_directories.get("log_directory", None),
+        "console_log_directory": o_directories.get("console_log_directory", None),
+    }
+
+    # Define our dynamic directory schema
+    config_directories["dnsmasq_dynamic_directory"] = (
+        config_directories["dynamic_directory"] + "/dnsmasq"
+    )
+    config_directories["pdns_dynamic_directory"] = (
+        config_directories["dynamic_directory"] + "/pdns"
+    )
+    config_directories["nft_dynamic_directory"] = (
+        config_directories["dynamic_directory"] + "/nft"
+    )
+
+    # Define our log directory schema
+    config_directories["dnsmasq_log_directory"] = (
+        config_directories["log_directory"] + "/dnsmasq"
+    )
+    config_directories["pdns_log_directory"] = (
+        config_directories["log_directory"] + "/pdns"
+    )
+    config_directories["nft_log_directory"] = (
+        config_directories["log_directory"] + "/nft"
+    )
+
+    config = {**config, **config_directories}
+
+    # Get the logging configuration
+    try:
+        o_logging = o_config["pvc"]["system"]["configuration"]["logging"]
+    except Exception as e:
+        raise MalformedConfigurationError(e)
+
+    config_logging = {
+        "file_logging": o_logging.get("file_logging", False),
+        "stdout_logging": o_logging.get("stdout_logging", False),
+        "zookeeper_logging": o_logging.get("zookeeper_logging", False),
+        "log_colours": o_logging.get("log_colours", False),
+        "log_dates": o_logging.get("log_dates", False),
+        "log_keepalives": o_logging.get("log_keepalives", False),
+        "log_keepalive_cluster_details": o_logging.get(
+            "log_keepalive_cluster_details", False
+        ),
+        "log_keepalive_plugin_details": o_logging.get(
+            "log_keepalive_plugin_details", False
+        ),
+        "console_log_lines": o_logging.get("console_log_lines", False),
+        "node_log_lines": o_logging.get("node_log_lines", False),
+    }
+
+    config = {**config, **config_logging}
+
+    # Get the interval configuration
+    try:
+        o_intervals = o_config["pvc"]["system"]["intervals"]
+    except Exception as e:
+        raise MalformedConfigurationError(e)
+
+    config_intervals = {
+        "vm_shutdown_timeout": int(o_intervals.get("vm_shutdown_timeout", 60)),
+        "keepalive_interval": int(o_intervals.get("keepalive_interval", 5)),
+        "monitoring_interval": int(o_intervals.get("monitoring_interval", 60)),
+        "fence_intervals": int(o_intervals.get("fence_intervals", 6)),
+        "suicide_intervals": int(o_intervals.get("suicide_interval", 0)),
+    }
+
+    config = {**config, **config_intervals}
+
+    # Get the fencing configuration
+    try:
+        o_fencing = o_config["pvc"]["system"]["fencing"]
+        o_fencing_actions = o_fencing["actions"]
+        o_fencing_ipmi = o_fencing["ipmi"]
+    except Exception as e:
+        raise MalformedConfigurationError(e)
+
+    config_fencing = {
+        "successful_fence": o_fencing_actions.get("successful_fence", None),
+        "failed_fence": o_fencing_actions.get("failed_fence", None),
+        "ipmi_hostname": o_fencing_ipmi.get(
+            "host", f"{node_hostname}-lom.{node_domain}"
+        ),
+        "ipmi_username": o_fencing_ipmi.get("user", "null"),
+        "ipmi_password": o_fencing_ipmi.get("pass", "null"),
+    }
+
+    config = {**config, **config_fencing}
+
+    # Get the migration configuration
+    try:
+        o_migration = o_config["pvc"]["system"]["migration"]
+    except Exception as e:
+        raise MalformedConfigurationError(e)
+
+    config_migration = {
+        "migration_target_selector": o_migration.get("target_selector", "mem"),
+    }
+
+    config = {**config, **config_migration}
+
+    if config["enable_networking"]:
+        # Get the node networks configuration
+        try:
+            o_networks = o_config["pvc"]["cluster"]["networks"]
+            o_network_cluster = o_networks["cluster"]
+            o_network_storage = o_networks["storage"]
+            o_network_upstream = o_networks["upstream"]
+            o_sysnetworks = o_config["pvc"]["system"]["configuration"]["networking"]
+            o_sysnetwork_cluster = o_sysnetworks["cluster"]
+            o_sysnetwork_storage = o_sysnetworks["storage"]
+            o_sysnetwork_upstream = o_sysnetworks["upstream"]
+        except Exception as e:
+            raise MalformedConfigurationError(e)
+
+        config_networks = {
+            "cluster_domain": o_network_cluster.get("domain", None),
+            "cluster_network": o_network_cluster.get("network", None),
+            "cluster_floating_ip": o_network_cluster.get("floating_ip", None),
+            "cluster_dev": o_sysnetwork_cluster.get("device", None),
+            "cluster_mtu": o_sysnetwork_cluster.get("mtu", None),
+            "cluster_dev_ip": o_sysnetwork_cluster.get("address", None),
+            "storage_domain": o_network_storage.get("domain", None),
+            "storage_network": o_network_storage.get("network", None),
+            "storage_floating_ip": o_network_storage.get("floating_ip", None),
+            "storage_dev": o_sysnetwork_storage.get("device", None),
+            "storage_mtu": o_sysnetwork_storage.get("mtu", None),
+            "storage_dev_ip": o_sysnetwork_storage.get("address", None),
+            "upstream_domain": o_network_upstream.get("domain", None),
+            "upstream_network": o_network_upstream.get("network", None),
+            "upstream_floating_ip": o_network_upstream.get("floating_ip", None),
+            "upstream_gateway": o_network_upstream.get("gateway", None),
+            "upstream_dev": o_sysnetwork_upstream.get("device", None),
+            "upstream_mtu": o_sysnetwork_upstream.get("mtu", None),
+            "upstream_dev_ip": o_sysnetwork_upstream.get("address", None),
+            "bridge_dev": o_sysnetworks.get("bridge_device", None),
+            "bridge_mtu": o_sysnetworks.get("bridge_mtu", None),
+            "enable_sriov": o_sysnetworks.get("sriov_enable", False),
+            "sriov_device": o_sysnetworks.get("sriov_device", list()),
+        }
+
+        if config_networks["bridge_mtu"] is None:
+            # Read the current MTU of bridge_dev and set bridge_mtu to it; avoids weird resets
+            retcode, stdout, stderr = common.run_os_command(
+                f"ip -json link show dev {config_networks['bridge_dev']}"
+            )
+            current_bridge_mtu = loads(stdout)[0]["mtu"]
+            print(
+                f"Config key bridge_mtu not explicitly set; using live MTU {current_bridge_mtu} from {config_networks['bridge_dev']}"
+            )
+            config_networks["bridge_mtu"] = current_bridge_mtu
+
+        config = {**config, **config_networks}
+
+        for network_type in ["cluster", "storage", "upstream"]:
+            result, msg = validate_floating_ip(config, network_type)
+            if not result:
+                raise MalformedConfigurationError(msg)
+
+            address_key = "{}_dev_ip".format(network_type)
+            network_key = f"{network_type}_network"
+            network = ip_network(config[network_key])
+            # With autoselection of addresses, construct an IP from the relevant network
+            if config[address_key] == "by-id":
+                # The NodeID starts at 1, but indexes start at 0
+                address_id = int(config["node_id"]) - 1
+                # Grab the nth address from the network
+                config[address_key] = "{}/{}".format(
+                    list(network.hosts())[address_id], network.prefixlen
+                )
+            # Validate the provided IP instead
+            else:
+                try:
+                    address = ip_address(config[address_key].split("/")[0])
+                    if address not in list(network.hosts()):
+                        raise
+                except Exception:
+                    raise MalformedConfigurationError(
+                        f"IP address {config[address_key]} for {address_key} is not valid"
+                    )
+
+        # Get the PowerDNS aggregator database configuration
+        try:
+            o_pdnsdb = o_config["pvc"]["coordinator"]["dns"]["database"]
+        except Exception as e:
+            raise MalformedConfigurationError(e)
+
+        config_pdnsdb = {
+            "pdns_postgresql_host": o_pdnsdb.get("host", None),
+            "pdns_postgresql_port": o_pdnsdb.get("port", None),
+            "pdns_postgresql_dbname": o_pdnsdb.get("name", None),
+            "pdns_postgresql_user": o_pdnsdb.get("user", None),
+            "pdns_postgresql_password": o_pdnsdb.get("pass", None),
+        }
+
+        config = {**config, **config_pdnsdb}
+
+        # Get the Cloud-Init Metadata database configuration
+        try:
+            o_metadatadb = o_config["pvc"]["coordinator"]["metadata"]["database"]
+        except Exception as e:
+            raise MalformedConfigurationError(e)
+
+        config_metadatadb = {
+            "metadata_postgresql_host": o_metadatadb.get("host", None),
+            "metadata_postgresql_port": o_metadatadb.get("port", None),
+            "metadata_postgresql_dbname": o_metadatadb.get("name", None),
+            "metadata_postgresql_user": o_metadatadb.get("user", None),
+            "metadata_postgresql_password": o_metadatadb.get("pass", None),
+        }
+
+        config = {**config, **config_metadatadb}
+
+    if config["enable_storage"]:
+        # Get the storage configuration
+        try:
+            o_storage = o_config["pvc"]["system"]["configuration"]["storage"]
+        except Exception as e:
+            raise MalformedConfigurationError(e)
+
+        config_storage = {
+            "ceph_config_file": o_storage.get("ceph_config_file", None),
+            "ceph_admin_keyring": o_storage.get("ceph_admin_keyring", None),
+        }
+
+        config = {**config, **config_storage}
+
+        # Add our node static data to the config
+        config["static_data"] = get_static_data()
+
+    return config
+
+
+def get_configuration():
+    """
+    Parse the configuration of the node daemon.
+    """
+    pvc_config_file, pvc_config_type = get_configuration_path()
+
+    if pvc_config_type == "legacy":
+        config = get_configuration_legacy(pvc_config_file)
+    else:
+        config = get_configuration_current(pvc_config_file)
+
+    return config
+
+
+def validate_directories(config):
+    if not os.path.exists(config["dynamic_directory"]):
+        os.makedirs(config["dynamic_directory"])
+        os.makedirs(config["dnsmasq_dynamic_directory"])
+        os.makedirs(config["pdns_dynamic_directory"])
+        os.makedirs(config["nft_dynamic_directory"])
+
+    if not os.path.exists(config["log_directory"]):
+        os.makedirs(config["log_directory"])
+        os.makedirs(config["dnsmasq_log_directory"])
+        os.makedirs(config["pdns_log_directory"])
+        os.makedirs(config["nft_log_directory"])
--- a/health-daemon/pvchealthd/util/fencing.py
+++ b/health-daemon/pvchealthd/util/fencing.py
@@ -0,0 +1,338 @@
+#!/usr/bin/env python3
+
+# fencing.py - Utility functions for pvcnoded fencing
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+import time
+
+import daemon_lib.common as common
+
+from daemon_lib.vm import vm_worker_flush_locks
+
+
+#
+# Fence thread entry function
+#
+def fence_node(node_name, zkhandler, config, logger):
+    # We allow exactly 6 saving throws (30 seconds) for the host to come back online or we kill it
+    failcount_limit = 6
+    failcount = 0
+    while failcount < failcount_limit:
+        # Wait 5 seconds
+        time.sleep(config["keepalive_interval"])
+        # Get the state
+        node_daemon_state = zkhandler.read(("node.state.daemon", node_name))
+        # Is it still 'dead'
+        if node_daemon_state == "dead":
+            failcount += 1
+            logger.out(
+                f"Node {node_name} failed {failcount}/{failcount_limit} saving throws",
+                state="s",
+                prefix=f"fencing {node_name}",
+            )
+        # It changed back to something else so it must be alive
+        else:
+            logger.out(
+                f"Node {node_name} passed a saving throw; cancelling fance",
+                state="o",
+                prefix=f"fencing {node_name}",
+            )
+            return
+
+    logger.out(
+        f"Fencing node {node_name} via IPMI reboot signal",
+        state="s",
+        prefix=f"fencing {node_name}",
+    )
+
+    # Get IPMI information
+    ipmi_hostname = zkhandler.read(("node.ipmi.hostname", node_name))
+    ipmi_username = zkhandler.read(("node.ipmi.username", node_name))
+    ipmi_password = zkhandler.read(("node.ipmi.password", node_name))
+
+    # Shoot it in the head
+    fence_status = reboot_via_ipmi(
+        node_name, ipmi_hostname, ipmi_username, ipmi_password, logger
+    )
+
+    # Hold to ensure the fence takes effect and system stabilizes
+    logger.out(
+        f"Waiting {config['keepalive_interval']}s for fence of node {node_name} to take effect",
+        state="i",
+        prefix=f"fencing {node_name}",
+    )
+    time.sleep(config["keepalive_interval"])
+
+    if fence_status:
+        logger.out(
+            f"Marking node {node_name} as fenced",
+            state="i",
+            prefix=f"fencing {node_name}",
+        )
+        while True:
+            try:
+                zkhandler.write([(("node.state.daemon", node_name), "fenced")])
+                break
+            except Exception:
+                continue
+
+    # Force into secondary network state if needed
+    if node_name in config["coordinators"]:
+        logger.out(
+            f"Forcing secondary coordinator state for node {node_name}",
+            state="i",
+            prefix=f"fencing {node_name}",
+        )
+        zkhandler.write([(("node.state.router", node_name), "secondary")])
+        if zkhandler.read("base.config.primary_node") == node_name:
+            zkhandler.write([("base.config.primary_node", "none")])
+
+    # If the fence succeeded and successful_fence is migrate
+    if fence_status and config["successful_fence"] == "migrate":
+        migrateFromFencedNode(zkhandler, node_name, config, logger)
+
+    # If the fence failed and failed_fence is migrate
+    if (
+        not fence_status
+        and config["failed_fence"] == "migrate"
+        and config["suicide_intervals"] != "0"
+    ):
+        migrateFromFencedNode(zkhandler, node_name, config, logger)
+
+
+# Migrate hosts away from a fenced node
+def migrateFromFencedNode(zkhandler, node_name, config, logger):
+    logger.out(
+        f"Migrating VMs from dead node {node_name} to new hosts",
+        state="i",
+        prefix=f"fencing {node_name}",
+    )
+
+    # Get the list of VMs
+    dead_node_running_domains = zkhandler.read(
+        ("node.running_domains", node_name)
+    ).split()
+
+    # Set the node to a custom domainstate so we know what's happening
+    zkhandler.write([(("node.state.domain", node_name), "fence-flush")])
+
+    # Migrate a VM after a flush
+    def fence_migrate_vm(dom_uuid):
+        logger.out(
+            f"Flushing locks of VM {dom_uuid} due to fence",
+            state="i",
+            prefix=f"fencing {node_name}",
+        )
+        vm_worker_flush_locks(zkhandler, None, dom_uuid, force_unlock=True)
+
+        target_node = common.findTargetNode(zkhandler, dom_uuid)
+
+        if target_node is not None:
+            logger.out(
+                f"Migrating VM {dom_uuid} to node {target_node}",
+                state="i",
+                prefix=f"fencing {node_name}",
+            )
+            zkhandler.write(
+                [
+                    (("domain.state", dom_uuid), "start"),
+                    (("domain.node", dom_uuid), target_node),
+                    (("domain.last_node", dom_uuid), node_name),
+                ]
+            )
+            logger.out(
+                f"Successfully migrated running VM {dom_uuid} to node {target_node}",
+                state="o",
+                prefix=f"fencing {node_name}",
+            )
+        else:
+            logger.out(
+                f"No target node found for VM {dom_uuid}; marking autostart=True on current node",
+                state="i",
+                prefix=f"fencing {node_name}",
+            )
+            zkhandler.write(
+                {
+                    (("domain.state", dom_uuid), "stopped"),
+                    (("domain.meta.autostart", dom_uuid), "True"),
+                }
+            )
+            logger.out(
+                f"Successfully marked autostart for running VM {dom_uuid} on current node",
+                state="o",
+                prefix=f"fencing {node_name}",
+            )
+
+    # Loop through the VMs
+    for dom_uuid in dead_node_running_domains:
+        try:
+            fence_migrate_vm(dom_uuid)
+        except Exception as e:
+            logger.out(
+                f"Failed to migrate VM {dom_uuid}, continuing: {e}",
+                state="w",
+                prefix=f"fencing {node_name}",
+            )
+
+    # Set node in flushed state for easy remigrating when it comes back
+    zkhandler.write([(("node.state.domain", node_name), "flushed")])
+    logger.out(
+        f"All VMs flushed from dead node {node_name} to other nodes",
+        state="i",
+        prefix=f"fencing {node_name}",
+    )
+
+
+#
+# Perform an IPMI fence
+#
+def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
+    # Power off the node the node
+    logger.out(
+        "Sending power off to dead node",
+        state="i",
+        prefix=f"fencing {node_name}",
+    )
+    ipmi_stop_retcode, ipmi_stop_stdout, ipmi_stop_stderr = common.run_os_command(
+        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power off"
+    )
+    if ipmi_stop_retcode != 0:
+        logger.out(
+            f"Failed to power off dead node: {ipmi_stop_stderr}",
+            state="e",
+            prefix=f"fencing {node_name}",
+        )
+
+    logger.out(
+        "Waiting 5s for power off to take effect",
+        state="i",
+        prefix=f"fencing {node_name}",
+    )
+    time.sleep(5)
+
+    # Check the chassis power state
+    logger.out(
+        "Checking power state of dead node",
+        state="i",
+        prefix=f"fencing {node_name}",
+    )
+    ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
+        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
+    )
+    if ipmi_status_retcode == 0:
+        logger.out(
+            f"Current chassis power state is: {ipmi_status_stdout.strip()}",
+            state="i",
+            prefix=f"fencing {node_name}",
+        )
+    else:
+        logger.out(
+            "Current chassis power state is: Unknown",
+            state="w",
+            prefix=f"fencing {node_name}",
+        )
+
+    # Power on the node
+    logger.out(
+        "Sending power on to dead node",
+        state="i",
+        prefix=f"fencing {node_name}",
+    )
+    ipmi_start_retcode, ipmi_start_stdout, ipmi_start_stderr = common.run_os_command(
+        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power on"
+    )
+
+    if ipmi_start_retcode != 0:
+        logger.out(
+            f"Failed to power on dead node: {ipmi_start_stderr}",
+            state="w",
+            prefix=f"fencing {node_name}",
+        )
+
+    logger.out(
+        "Waiting 2s for power on to take effect",
+        state="i",
+        prefix=f"fencing {node_name}",
+    )
+    time.sleep(2)
+
+    # Check the chassis power state
+    logger.out(
+        "Checking power state of dead node",
+        state="i",
+        prefix=f"fencing {node_name}",
+    )
+    ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
+        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
+    )
+
+    if ipmi_stop_retcode == 0:
+        if ipmi_status_stdout.strip() == "Chassis Power is on":
+            # We successfully rebooted the node and it is powered on; this is a succeessful fence
+            logger.out(
+                "Successfully rebooted dead node; proceeding with fence recovery action",
+                state="o",
+                prefix=f"fencing {node_name}",
+            )
+            return True
+        elif ipmi_status_stdout.strip() == "Chassis Power is off":
+            # We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence
+            logger.out(
+                "Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence recovery action",
+                state="o",
+                prefix=f"fencing {node_name}",
+            )
+            return True
+        else:
+            # We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence
+            logger.out(
+                f"Chassis power is in an unknown state ({ipmi_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action",
+                state="e",
+                prefix=f"fencing {node_name}",
+            )
+            return False
+    else:
+        if ipmi_status_stdout.strip() == "Chassis Power is off":
+            # We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence
+            logger.out(
+                "Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence recovery action",
+                state="o",
+                prefix=f"fencing {node_name}",
+            )
+            return True
+        else:
+            # We failed to reboot the node but it is in some unknown power state (including "on"); since this might indicate a silent failure, we must call it a failed fence
+            logger.out(
+                "Chassis power is not in confirmed off state after failed IPMI reboot; NOT proceeding wiht fence recovery action",
+                state="e",
+                prefix=f"fencing {node_name}",
+            )
+            return False
+
+
+#
+# Verify that IPMI connectivity to this host exists (used during node init)
+#
+def verify_ipmi(ipmi_hostname, ipmi_user, ipmi_password):
+    ipmi_command = f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
+    retcode, stdout, stderr = common.run_os_command(ipmi_command, timeout=2)
+    if retcode == 0 and stdout.strip() == "Chassis Power is on":
+        return True
+    else:
+        return False
--- a/health-daemon/pvchealthd/util/keepalive.py
+++ b/health-daemon/pvchealthd/util/keepalive.py
@@ -0,0 +1,968 @@
+#!/usr/bin/env python3
+
+# keepalive.py - Utility functions for pvcnoded Keepalives
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+import pvcnoded.util.fencing
+
+import daemon_lib.common as common
+
+from apscheduler.schedulers.background import BackgroundScheduler
+from rados import Rados
+from xml.etree import ElementTree
+from queue import Queue
+from threading import Thread
+from datetime import datetime
+
+import json
+import re
+import libvirt
+import psutil
+import os
+import time
+
+
+# State table for pretty stats
+libvirt_vm_states = {
+    0: "NOSTATE",
+    1: "RUNNING",
+    2: "BLOCKED",
+    3: "PAUSED",
+    4: "SHUTDOWN",
+    5: "SHUTOFF",
+    6: "CRASHED",
+    7: "PMSUSPENDED",
+}
+
+
+def start_keepalive_timer(logger, config, zkhandler, this_node):
+    keepalive_interval = config["keepalive_interval"]
+    logger.out(
+        f"Starting keepalive timer ({keepalive_interval} second interval)", state="s"
+    )
+    keepalive_timer = BackgroundScheduler()
+    keepalive_timer.add_job(
+        node_keepalive,
+        args=(logger, config, zkhandler, this_node),
+        trigger="interval",
+        seconds=keepalive_interval,
+    )
+    keepalive_timer.start()
+    return keepalive_timer
+
+
+def stop_keepalive_timer(logger, keepalive_timer):
+    try:
+        keepalive_timer.shutdown()
+        logger.out("Stopping keepalive timer", state="s")
+    except Exception:
+        logger.out("Failed to stop keepalive timer", state="w")
+
+
+# Ceph stats update function
+def collect_ceph_stats(logger, config, zkhandler, this_node, queue):
+    pool_list = zkhandler.children("base.pool")
+    osd_list = zkhandler.children("base.osd")
+
+    debug = config["debug"]
+    if debug:
+        logger.out("Thread starting", state="d", prefix="ceph-thread")
+
+    # Connect to the Ceph cluster
+    try:
+        ceph_conn = Rados(
+            conffile=config["ceph_config_file"],
+            conf=dict(keyring=config["ceph_admin_keyring"]),
+        )
+        if debug:
+            logger.out("Connecting to cluster", state="d", prefix="ceph-thread")
+        ceph_conn.connect(timeout=1)
+    except Exception as e:
+        logger.out("Failed to open connection to Ceph cluster: {}".format(e), state="e")
+        return
+
+    # Primary-only functions
+    if this_node.coordinator_state == "primary":
+        # Get Ceph status information (pretty)
+        if debug:
+            logger.out(
+                "Set Ceph status information in zookeeper (primary only)",
+                state="d",
+                prefix="ceph-thread",
+            )
+
+        command = {"prefix": "status", "format": "pretty"}
+        ceph_status = ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[
+            1
+        ].decode("ascii")
+        try:
+            zkhandler.write([("base.storage", str(ceph_status))])
+        except Exception as e:
+            logger.out("Failed to set Ceph status data: {}".format(e), state="e")
+
+        # Get Ceph health information (JSON)
+        if debug:
+            logger.out(
+                "Set Ceph health information in zookeeper (primary only)",
+                state="d",
+                prefix="ceph-thread",
+            )
+
+        command = {"prefix": "health", "format": "json"}
+        ceph_health = ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[
+            1
+        ].decode("ascii")
+        try:
+            zkhandler.write([("base.storage.health", str(ceph_health))])
+        except Exception as e:
+            logger.out("Failed to set Ceph health data: {}".format(e), state="e")
+
+        # Get Ceph df information (pretty)
+        if debug:
+            logger.out(
+                "Set Ceph rados df information in zookeeper (primary only)",
+                state="d",
+                prefix="ceph-thread",
+            )
+
+        # Get rados df info
+        command = {"prefix": "df", "format": "pretty"}
+        ceph_df = ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[1].decode(
+            "ascii"
+        )
+        try:
+            zkhandler.write([("base.storage.util", str(ceph_df))])
+        except Exception as e:
+            logger.out("Failed to set Ceph utilization data: {}".format(e), state="e")
+
+        if debug:
+            logger.out(
+                "Set pool information in zookeeper (primary only)",
+                state="d",
+                prefix="ceph-thread",
+            )
+
+        # Get pool info
+        command = {"prefix": "df", "format": "json"}
+        ceph_df_output = ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[
+            1
+        ].decode("ascii")
+        try:
+            ceph_pool_df_raw = json.loads(ceph_df_output)["pools"]
+        except Exception as e:
+            logger.out("Failed to obtain Pool data (ceph df): {}".format(e), state="w")
+            ceph_pool_df_raw = []
+
+        retcode, stdout, stderr = common.run_os_command(
+            "rados df --format json", timeout=1
+        )
+        try:
+            rados_pool_df_raw = json.loads(stdout)["pools"]
+        except Exception as e:
+            logger.out("Failed to obtain Pool data (rados df): {}".format(e), state="w")
+            rados_pool_df_raw = []
+
+        pool_count = len(ceph_pool_df_raw)
+        if debug:
+            logger.out(
+                "Getting info for {} pools".format(pool_count),
+                state="d",
+                prefix="ceph-thread",
+            )
+        for pool_idx in range(0, pool_count):
+            try:
+                # Combine all the data for this pool
+                ceph_pool_df = ceph_pool_df_raw[pool_idx]
+                rados_pool_df = rados_pool_df_raw[pool_idx]
+                pool = ceph_pool_df
+                pool.update(rados_pool_df)
+
+                # Ignore any pools that aren't in our pool list
+                if pool["name"] not in pool_list:
+                    if debug:
+                        logger.out(
+                            "Pool {} not in pool list {}".format(
+                                pool["name"], pool_list
+                            ),
+                            state="d",
+                            prefix="ceph-thread",
+                        )
+                    continue
+                else:
+                    if debug:
+                        logger.out(
+                            "Parsing data for pool {}".format(pool["name"]),
+                            state="d",
+                            prefix="ceph-thread",
+                        )
+
+                # Assemble a useful data structure
+                pool_df = {
+                    "id": pool["id"],
+                    "stored_bytes": pool["stats"]["stored"],
+                    "free_bytes": pool["stats"]["max_avail"],
+                    "used_bytes": pool["stats"]["bytes_used"],
+                    "used_percent": pool["stats"]["percent_used"],
+                    "num_objects": pool["stats"]["objects"],
+                    "num_object_clones": pool["num_object_clones"],
+                    "num_object_copies": pool["num_object_copies"],
+                    "num_objects_missing_on_primary": pool[
+                        "num_objects_missing_on_primary"
+                    ],
+                    "num_objects_unfound": pool["num_objects_unfound"],
+                    "num_objects_degraded": pool["num_objects_degraded"],
+                    "read_ops": pool["read_ops"],
+                    "read_bytes": pool["read_bytes"],
+                    "write_ops": pool["write_ops"],
+                    "write_bytes": pool["write_bytes"],
+                }
+
+                # Write the pool data to Zookeeper
+                zkhandler.write(
+                    [(("pool.stats", pool["name"]), str(json.dumps(pool_df)))]
+                )
+            except Exception as e:
+                # One or more of the status commands timed out, just continue
+                logger.out(
+                    "Failed to format and send pool data: {}".format(e), state="w"
+                )
+                pass
+
+    # Only grab OSD stats if there are OSDs to grab (otherwise `ceph osd df` hangs)
+    osds_this_node = 0
+    if len(osd_list) > 0:
+        # Get data from Ceph OSDs
+        if debug:
+            logger.out("Get data from Ceph OSDs", state="d", prefix="ceph-thread")
+
+        # Parse the dump data
+        osd_dump = dict()
+
+        command = {"prefix": "osd dump", "format": "json"}
+        osd_dump_output = ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[
+            1
+        ].decode("ascii")
+        try:
+            osd_dump_raw = json.loads(osd_dump_output)["osds"]
+        except Exception as e:
+            logger.out("Failed to obtain OSD data: {}".format(e), state="w")
+            osd_dump_raw = []
+
+        if debug:
+            logger.out("Loop through OSD dump", state="d", prefix="ceph-thread")
+        for osd in osd_dump_raw:
+            osd_dump.update(
+                {
+                    str(osd["osd"]): {
+                        "uuid": osd["uuid"],
+                        "up": osd["up"],
+                        "in": osd["in"],
+                        "primary_affinity": osd["primary_affinity"],
+                    }
+                }
+            )
+
+        # Parse the df data
+        if debug:
+            logger.out("Parse the OSD df data", state="d", prefix="ceph-thread")
+
+        osd_df = dict()
+
+        command = {"prefix": "osd df", "format": "json"}
+        try:
+            osd_df_raw = json.loads(
+                ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[1]
+            )["nodes"]
+        except Exception as e:
+            logger.out("Failed to obtain OSD data: {}".format(e), state="w")
+            osd_df_raw = []
+
+        if debug:
+            logger.out("Loop through OSD df", state="d", prefix="ceph-thread")
+        for osd in osd_df_raw:
+            osd_df.update(
+                {
+                    str(osd["id"]): {
+                        "utilization": osd["utilization"],
+                        "var": osd["var"],
+                        "pgs": osd["pgs"],
+                        "kb": osd["kb"],
+                        "kb_used": osd["kb_used"],
+                        "kb_used_data": osd["kb_used_data"],
+                        "kb_used_omap": osd["kb_used_omap"],
+                        "kb_used_meta": osd["kb_used_meta"],
+                        "kb_avail": osd["kb_avail"],
+                        "weight": osd["crush_weight"],
+                        "reweight": osd["reweight"],
+                        "class": osd["device_class"],
+                    }
+                }
+            )
+
+        # Parse the status data
+        if debug:
+            logger.out("Parse the OSD status data", state="d", prefix="ceph-thread")
+
+        osd_status = dict()
+
+        command = {"prefix": "osd status", "format": "pretty"}
+        try:
+            osd_status_raw = ceph_conn.mon_command(json.dumps(command), b"", timeout=1)[
+                1
+            ].decode("ascii")
+        except Exception as e:
+            logger.out("Failed to obtain OSD status data: {}".format(e), state="w")
+            osd_status_raw = []
+
+        if debug:
+            logger.out("Loop through OSD status data", state="d", prefix="ceph-thread")
+
+        for line in osd_status_raw.split("\n"):
+            # Strip off colour
+            line = re.sub(r"\x1b(\[.*?[@-~]|\].*?(\x07|\x1b\\))", "", line)
+            # Split it for parsing
+            line = line.split()
+
+            # Ceph 14 format:
+            #  ['|', '0', '|', 'hv1.p.u.bonilan.net', '|', '318G', '|', '463G', '|', '213', '|', '1430k', '|', '22', '|', '124k', '|', 'exists,up', '|']
+            # Ceph 16 format:
+            #  ['0', 'hv1.t.u.bonilan.net', '2489M', '236G', '0', '0', '0', '0', 'exists,up']
+
+            # Bypass obviously invalid lines
+            if len(line) < 1:
+                continue
+            elif line[0] == "+":
+                continue
+
+            try:
+                # If line begins with | and second entry is a digit (i.e. OSD ID)
+                if line[0] == "|" and line[1].isdigit():
+                    # Parse the line in Ceph 14 format
+                    osd_id = line[1]
+                    node = line[3].split(".")[0]
+                    used = line[5]
+                    avail = line[7]
+                    wr_ops = line[9]
+                    wr_data = line[11]
+                    rd_ops = line[13]
+                    rd_data = line[15]
+                    state = line[17]
+                # If first entry is a digit (i.e. OSD ID)
+                elif line[0].isdigit():
+                    # Parse the line in Ceph 16 format
+                    osd_id = line[0]
+                    node = line[1].split(".")[0]
+                    used = line[2]
+                    avail = line[3]
+                    wr_ops = line[4]
+                    wr_data = line[5]
+                    rd_ops = line[6]
+                    rd_data = line[7]
+                    state = line[8]
+                # Otherwise, it's the header line and is ignored
+                else:
+                    continue
+            except IndexError:
+                continue
+
+            # I don't know why 2018 me used this construct instead of a normal
+            # dictionary update, but it works so not changing it.
+            # ref: bfbe9188ce830381f3f2fa1da11f1973f08eca8c
+            osd_status.update(
+                {
+                    str(osd_id): {
+                        "node": node,
+                        "used": used,
+                        "avail": avail,
+                        "wr_ops": wr_ops,
+                        "wr_data": wr_data,
+                        "rd_ops": rd_ops,
+                        "rd_data": rd_data,
+                        "state": state,
+                    }
+                }
+            )
+
+        # Merge them together into a single meaningful dict
+        if debug:
+            logger.out("Merge OSD data together", state="d", prefix="ceph-thread")
+
+        osd_stats = dict()
+
+        for osd in osd_list:
+            if zkhandler.read(("osd.node", osd)) == config["node_hostname"]:
+                osds_this_node += 1
+            try:
+                this_dump = osd_dump[osd]
+                this_dump.update(osd_df[osd])
+                this_dump.update(osd_status[osd])
+                osd_stats[osd] = this_dump
+            except KeyError as e:
+                # One or more of the status commands timed out, just continue
+                logger.out(
+                    "Failed to parse OSD stats into dictionary: {}".format(e), state="w"
+                )
+
+        # Upload OSD data for the cluster (primary-only)
+        if this_node.coordinator_state == "primary":
+            if debug:
+                logger.out(
+                    "Trigger updates for each OSD", state="d", prefix="ceph-thread"
+                )
+
+            for osd in osd_list:
+                try:
+                    stats = json.dumps(osd_stats[osd])
+                    zkhandler.write([(("osd.stats", osd), str(stats))])
+                except KeyError as e:
+                    # One or more of the status commands timed out, just continue
+                    logger.out(
+                        "Failed to upload OSD stats from dictionary: {}".format(e),
+                        state="w",
+                    )
+
+    ceph_conn.shutdown()
+
+    queue.put(osds_this_node)
+
+    if debug:
+        logger.out("Thread finished", state="d", prefix="ceph-thread")
+
+
+# VM stats update function
+def collect_vm_stats(logger, config, zkhandler, this_node, queue):
+    debug = config["debug"]
+    if debug:
+        logger.out("Thread starting", state="d", prefix="vm-thread")
+
+    # Connect to libvirt
+    libvirt_name = "qemu:///system"
+    if debug:
+        logger.out("Connecting to libvirt", state="d", prefix="vm-thread")
+    try:
+        lv_conn = libvirt.open(libvirt_name)
+        if lv_conn is None:
+            raise Exception
+    except Exception:
+        logger.out('Failed to open connection to "{}"'.format(libvirt_name), state="e")
+        return
+
+    memalloc = 0
+    memprov = 0
+    vcpualloc = 0
+    # Toggle state management of dead VMs to restart them
+    if debug:
+        logger.out(
+            "Toggle state management of dead VMs to restart them",
+            state="d",
+            prefix="vm-thread",
+        )
+    # Make a copy of the d_domain; if not, and it changes in flight, this can fail
+    fixed_d_domain = this_node.d_domain.copy()
+    for domain, instance in fixed_d_domain.items():
+        if domain in this_node.domain_list:
+            if instance.getstate() == "start" and instance.getnode() == this_node.name:
+                if instance.getdom() is not None:
+                    try:
+                        if instance.getdom().state()[0] != libvirt.VIR_DOMAIN_RUNNING:
+                            logger.out(
+                                "VM {} has failed".format(instance.domname),
+                                state="w",
+                                prefix="vm-thread",
+                            )
+                            raise
+                    except Exception:
+                        # Toggle a state "change"
+                        logger.out(
+                            "Resetting state to {} for VM {}".format(
+                                instance.getstate(), instance.domname
+                            ),
+                            state="i",
+                            prefix="vm-thread",
+                        )
+                        zkhandler.write(
+                            [(("domain.state", domain), instance.getstate())]
+                        )
+        elif instance.getnode() == this_node.name:
+            memprov += instance.getmemory()
+
+    # Get list of running domains from Libvirt
+    running_domains = lv_conn.listAllDomains(libvirt.VIR_CONNECT_LIST_DOMAINS_ACTIVE)
+
+    # Get statistics from any running VMs
+    for domain in running_domains:
+        try:
+            # Get basic information about the VM
+            tree = ElementTree.fromstring(domain.XMLDesc())
+            domain_uuid = domain.UUIDString()
+            domain_name = domain.name()
+
+            # Get all the raw information about the VM
+            if debug:
+                logger.out(
+                    "Getting general statistics for VM {}".format(domain_name),
+                    state="d",
+                    prefix="vm-thread",
+                )
+            (
+                domain_state,
+                domain_maxmem,
+                domain_mem,
+                domain_vcpus,
+                domain_cputime,
+            ) = domain.info()
+            # We can't properly gather stats from a non-running VMs so continue
+            if domain_state != libvirt.VIR_DOMAIN_RUNNING:
+                continue
+            domain_memory_stats = domain.memoryStats()
+            domain_cpu_stats = domain.getCPUStats(True)[0]
+
+            # Add the allocated memory to our memalloc value
+            memalloc += instance.getmemory()
+            memprov += instance.getmemory()
+            vcpualloc += instance.getvcpus()
+        except Exception as e:
+            if debug:
+                try:
+                    logger.out(
+                        "Failed getting VM information for {}: {}".format(
+                            domain.name(), e
+                        ),
+                        state="d",
+                        prefix="vm-thread",
+                    )
+                except Exception:
+                    pass
+            continue
+
+        # Ensure VM is present in the domain_list
+        if domain_uuid not in this_node.domain_list:
+            this_node.domain_list.append(domain_uuid)
+
+        if debug:
+            logger.out(
+                "Getting disk statistics for VM {}".format(domain_name),
+                state="d",
+                prefix="vm-thread",
+            )
+        domain_disk_stats = []
+        try:
+            for disk in tree.findall("devices/disk"):
+                disk_name = disk.find("source").get("name")
+                if not disk_name:
+                    disk_name = disk.find("source").get("file")
+                disk_stats = domain.blockStats(disk.find("target").get("dev"))
+                domain_disk_stats.append(
+                    {
+                        "name": disk_name,
+                        "rd_req": disk_stats[0],
+                        "rd_bytes": disk_stats[1],
+                        "wr_req": disk_stats[2],
+                        "wr_bytes": disk_stats[3],
+                        "err": disk_stats[4],
+                    }
+                )
+        except Exception as e:
+            if debug:
+                try:
+                    logger.out(
+                        "Failed getting disk stats for {}: {}".format(domain.name(), e),
+                        state="d",
+                        prefix="vm-thread",
+                    )
+                except Exception:
+                    pass
+            continue
+
+        if debug:
+            logger.out(
+                "Getting network statistics for VM {}".format(domain_name),
+                state="d",
+                prefix="vm-thread",
+            )
+        domain_network_stats = []
+        try:
+            for interface in tree.findall("devices/interface"):
+                interface_type = interface.get("type")
+                if interface_type not in ["bridge"]:
+                    continue
+                interface_name = interface.find("target").get("dev")
+                interface_bridge = interface.find("source").get("bridge")
+                interface_stats = domain.interfaceStats(interface_name)
+                domain_network_stats.append(
+                    {
+                        "name": interface_name,
+                        "bridge": interface_bridge,
+                        "rd_bytes": interface_stats[0],
+                        "rd_packets": interface_stats[1],
+                        "rd_errors": interface_stats[2],
+                        "rd_drops": interface_stats[3],
+                        "wr_bytes": interface_stats[4],
+                        "wr_packets": interface_stats[5],
+                        "wr_errors": interface_stats[6],
+                        "wr_drops": interface_stats[7],
+                    }
+                )
+        except Exception as e:
+            if debug:
+                try:
+                    logger.out(
+                        "Failed getting network stats for {}: {}".format(
+                            domain.name(), e
+                        ),
+                        state="d",
+                        prefix="vm-thread",
+                    )
+                except Exception:
+                    pass
+            continue
+
+        # Create the final dictionary
+        domain_stats = {
+            "state": libvirt_vm_states[domain_state],
+            "maxmem": domain_maxmem,
+            "livemem": domain_mem,
+            "cpus": domain_vcpus,
+            "cputime": domain_cputime,
+            "mem_stats": domain_memory_stats,
+            "cpu_stats": domain_cpu_stats,
+            "disk_stats": domain_disk_stats,
+            "net_stats": domain_network_stats,
+        }
+
+        if debug:
+            logger.out(
+                "Writing statistics for VM {} to Zookeeper".format(domain_name),
+                state="d",
+                prefix="vm-thread",
+            )
+
+        try:
+            zkhandler.write(
+                [(("domain.stats", domain_uuid), str(json.dumps(domain_stats)))]
+            )
+        except Exception as e:
+            if debug:
+                logger.out(
+                    "Failed to write domain statistics: {}".format(e),
+                    state="d",
+                    prefix="vm-thread",
+                )
+
+    # Close the Libvirt connection
+    lv_conn.close()
+
+    if debug:
+        logger.out(
+            f"VM stats: doms: {len(running_domains)}; memalloc: {memalloc}; memprov: {memprov}; vcpualloc: {vcpualloc}",
+            state="d",
+            prefix="vm-thread",
+        )
+
+    queue.put(len(running_domains))
+    queue.put(memalloc)
+    queue.put(memprov)
+    queue.put(vcpualloc)
+
+    if debug:
+        logger.out("Thread finished", state="d", prefix="vm-thread")
+
+
+# Keepalive update function
+def node_keepalive(logger, config, zkhandler, this_node):
+    debug = config["debug"]
+
+    # Display node information to the terminal
+    if config["log_keepalives"]:
+        if this_node.coordinator_state == "primary":
+            cst_colour = logger.fmt_green
+        elif this_node.coordinator_state == "secondary":
+            cst_colour = logger.fmt_blue
+        else:
+            cst_colour = logger.fmt_cyan
+
+        active_coordinator_state = this_node.coordinator_state
+
+        runtime_start = datetime.now()
+
+    # Set the migration selector in Zookeeper for clients to read
+    if config["enable_hypervisor"]:
+        if this_node.coordinator_state == "primary":
+            try:
+                if (
+                    zkhandler.read("base.config.migration_target_selector")
+                    != config["migration_target_selector"]
+                ):
+                    zkhandler.write(
+                        [
+                            (
+                                "base.config.migration_target_selector",
+                                config["migration_target_selector"],
+                            )
+                        ]
+                    )
+            except Exception:
+                logger.out(
+                    "Failed to set migration target selector in Zookeeper",
+                    state="e",
+                    prefix="main-thread",
+                )
+
+    # Set the upstream IP in Zookeeper for clients to read
+    if config["enable_networking"]:
+        if this_node.coordinator_state == "primary":
+            try:
+                if (
+                    zkhandler.read("base.config.upstream_ip")
+                    != config["upstream_floating_ip"]
+                ):
+                    zkhandler.write(
+                        [("base.config.upstream_ip", config["upstream_floating_ip"])]
+                    )
+            except Exception:
+                logger.out(
+                    "Failed to set upstream floating IP in Zookeeper",
+                    state="e",
+                    prefix="main-thread",
+                )
+
+    # Get past state and update if needed
+    if debug:
+        logger.out(
+            "Get past state and update if needed", state="d", prefix="main-thread"
+        )
+
+    past_state = zkhandler.read(("node.state.daemon", this_node.name))
+    if past_state != "run" and past_state != "shutdown":
+        this_node.daemon_state = "run"
+        zkhandler.write([(("node.state.daemon", this_node.name), "run")])
+    else:
+        this_node.daemon_state = "run"
+
+    # Ensure the primary key is properly set
+    if debug:
+        logger.out(
+            "Ensure the primary key is properly set", state="d", prefix="main-thread"
+        )
+    if this_node.coordinator_state == "primary":
+        if zkhandler.read("base.config.primary_node") != this_node.name:
+            zkhandler.write([("base.config.primary_node", this_node.name)])
+
+    # Run VM statistics collection in separate thread for parallelization
+    if config["enable_hypervisor"]:
+        vm_thread_queue = Queue()
+        vm_stats_thread = Thread(
+            target=collect_vm_stats,
+            args=(logger, config, zkhandler, this_node, vm_thread_queue),
+            kwargs={},
+        )
+        vm_stats_thread.start()
+
+    # Run Ceph status collection in separate thread for parallelization
+    if config["enable_storage"]:
+        ceph_thread_queue = Queue()
+        ceph_stats_thread = Thread(
+            target=collect_ceph_stats,
+            args=(logger, config, zkhandler, this_node, ceph_thread_queue),
+            kwargs={},
+        )
+        ceph_stats_thread.start()
+
+    # Get node performance statistics
+    this_node.memtotal = int(psutil.virtual_memory().total / 1024 / 1024)
+    this_node.memused = int(psutil.virtual_memory().used / 1024 / 1024)
+    this_node.memfree = int(psutil.virtual_memory().free / 1024 / 1024)
+    this_node.cpuload = round(os.getloadavg()[0], 2)
+
+    # Join against running threads
+    if config["enable_hypervisor"]:
+        vm_stats_thread.join(timeout=config["keepalive_interval"])
+        if vm_stats_thread.is_alive():
+            logger.out("VM stats gathering exceeded timeout, continuing", state="w")
+    if config["enable_storage"]:
+        ceph_stats_thread.join(timeout=config["keepalive_interval"])
+        if ceph_stats_thread.is_alive():
+            logger.out("Ceph stats gathering exceeded timeout, continuing", state="w")
+
+    # Get information from thread queues
+    if config["enable_hypervisor"]:
+        try:
+            this_node.domains_count = vm_thread_queue.get(
+                timeout=config["keepalive_interval"]
+            )
+            this_node.memalloc = vm_thread_queue.get(
+                timeout=config["keepalive_interval"]
+            )
+            this_node.memprov = vm_thread_queue.get(
+                timeout=config["keepalive_interval"]
+            )
+            this_node.vcpualloc = vm_thread_queue.get(
+                timeout=config["keepalive_interval"]
+            )
+        except Exception:
+            logger.out("VM stats queue get exceeded timeout, continuing", state="w")
+    else:
+        this_node.domains_count = 0
+        this_node.memalloc = 0
+        this_node.memprov = 0
+        this_node.vcpualloc = 0
+
+    if config["enable_storage"]:
+        try:
+            osds_this_node = ceph_thread_queue.get(
+                timeout=(config["keepalive_interval"] - 1)
+            )
+        except Exception:
+            logger.out("Ceph stats queue get exceeded timeout, continuing", state="w")
+            osds_this_node = "?"
+    else:
+        osds_this_node = "0"
+
+    # Set our information in zookeeper
+    keepalive_time = int(time.time())
+    if debug:
+        logger.out("Set our information in zookeeper", state="d", prefix="main-thread")
+    try:
+        zkhandler.write(
+            [
+                (("node.memory.total", this_node.name), str(this_node.memtotal)),
+                (("node.memory.used", this_node.name), str(this_node.memused)),
+                (("node.memory.free", this_node.name), str(this_node.memfree)),
+                (("node.memory.allocated", this_node.name), str(this_node.memalloc)),
+                (("node.memory.provisioned", this_node.name), str(this_node.memprov)),
+                (("node.vcpu.allocated", this_node.name), str(this_node.vcpualloc)),
+                (("node.cpu.load", this_node.name), str(this_node.cpuload)),
+                (
+                    ("node.count.provisioned_domains", this_node.name),
+                    str(this_node.domains_count),
+                ),
+                (
+                    ("node.running_domains", this_node.name),
+                    " ".join(this_node.domain_list),
+                ),
+                (("node.keepalive", this_node.name), str(keepalive_time)),
+            ]
+        )
+    except Exception:
+        logger.out("Failed to set keepalive data", state="e")
+
+    if config["log_keepalives"]:
+        runtime_end = datetime.now()
+        runtime_delta = runtime_end - runtime_start
+        runtime = "{:0.02f}".format(runtime_delta.total_seconds())
+
+        logger.out(
+            "{start_colour}{hostname} keepalive @ {starttime}{nofmt} [{cst_colour}{costate}{nofmt}] in {runtime} seconds".format(
+                start_colour=logger.fmt_purple,
+                cst_colour=logger.fmt_bold + cst_colour,
+                nofmt=logger.fmt_end,
+                hostname=config["node_hostname"],
+                starttime=runtime_start,
+                costate=active_coordinator_state,
+                runtime=runtime,
+            ),
+            state="t",
+        )
+
+        if this_node.maintenance is True:
+            maintenance_colour = logger.fmt_blue
+        else:
+            maintenance_colour = logger.fmt_green
+
+        if isinstance(this_node.health, int):
+            if this_node.health > 90:
+                health_colour = logger.fmt_green
+            elif this_node.health > 50:
+                health_colour = logger.fmt_yellow
+            else:
+                health_colour = logger.fmt_red
+            health_text = str(this_node.health) + "%"
+
+        else:
+            health_colour = logger.fmt_blue
+            health_text = "N/A"
+
+        if config["log_keepalive_cluster_details"]:
+            logger.out(
+                "{bold}Maintenance:{nofmt} {maintenance_colour}{maintenance}{nofmt}  "
+                "{bold}Health:{nofmt} {health_colour}{health}{nofmt}  "
+                "{bold}VMs:{nofmt} {domcount}  "
+                "{bold}OSDs:{nofmt} {osdcount}  "
+                "{bold}Load:{nofmt} {load}  "
+                "{bold}Memory [MiB]: "
+                "{bold}Used:{nofmt} {usedmem}  "
+                "{bold}Free:{nofmt} {freemem}".format(
+                    bold=logger.fmt_bold,
+                    maintenance_colour=maintenance_colour,
+                    health_colour=health_colour,
+                    nofmt=logger.fmt_end,
+                    maintenance=this_node.maintenance,
+                    health=health_text,
+                    domcount=this_node.domains_count,
+                    osdcount=osds_this_node,
+                    load=this_node.cpuload,
+                    freemem=this_node.memfree,
+                    usedmem=this_node.memused,
+                ),
+                state="t",
+            )
+
+    # Look for dead nodes and fence them
+    if not this_node.maintenance:
+        if debug:
+            logger.out(
+                "Look for dead nodes and fence them", state="d", prefix="main-thread"
+            )
+        if config["daemon_mode"] == "coordinator":
+            for node_name in zkhandler.children("base.node"):
+                try:
+                    node_daemon_state = zkhandler.read(("node.state.daemon", node_name))
+                    node_keepalive = int(zkhandler.read(("node.keepalive", node_name)))
+                except Exception:
+                    node_daemon_state = "unknown"
+                    node_keepalive = 0
+
+                # Handle deadtime and fencng if needed
+                # (A node is considered dead when its keepalive timer is >6*keepalive_interval seconds
+                # out-of-date while in 'start' state)
+                node_deadtime = int(time.time()) - (
+                    int(config["keepalive_interval"]) * int(config["fence_intervals"])
+                )
+                if node_keepalive < node_deadtime and node_daemon_state == "run":
+                    logger.out(
+                        "Node {} seems dead - starting monitor for fencing".format(
+                            node_name
+                        ),
+                        state="w",
+                    )
+                    zk_lock = zkhandler.writelock(("node.state.daemon", node_name))
+                    with zk_lock:
+                        # Ensures that, if we lost the lock race and come out of waiting,
+                        # we won't try to trigger our own fence thread.
+                        if zkhandler.read(("node.state.daemon", node_name)) != "dead":
+                            fence_thread = Thread(
+                                target=pvcnoded.util.fencing.fence_node,
+                                args=(node_name, zkhandler, config, logger),
+                                kwargs={},
+                            )
+                            fence_thread.start()
+                            # Write the updated data after we start the fence thread
+                            zkhandler.write(
+                                [(("node.state.daemon", node_name), "dead")]
+                            )
--- a/health-daemon/pvchealthd/util/libvirt.py
+++ b/health-daemon/pvchealthd/util/libvirt.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+
+# libvirt.py - Utility functions for pvcnoded libvirt
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+import libvirt
+
+
+def validate_libvirtd(logger, config):
+    if config["enable_hypervisor"]:
+        libvirt_check_name = f'qemu+tcp://{config["node_hostname"]}/system'
+        logger.out(f"Connecting to Libvirt daemon at {libvirt_check_name}", state="i")
+        try:
+            lv_conn = libvirt.open(libvirt_check_name)
+            lv_conn.close()
+        except Exception as e:
+            logger.out(f"Failed to connect to Libvirt daemon: {e}", state="e")
+            return False
+
+    return True
--- a/health-daemon/pvchealthd/util/networking.py
+++ b/health-daemon/pvchealthd/util/networking.py
@@ -0,0 +1,232 @@
+#!/usr/bin/env python3
+
+# networking.py - Utility functions for pvcnoded networking
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+import daemon_lib.common as common
+
+from time import sleep
+from os import makedirs
+
+
+def setup_sriov(logger, config):
+    logger.out("Setting up SR-IOV device support", state="i")
+
+    # Enable unsafe interrupts for the vfio_iommu_type1 kernel module
+    try:
+        common.run_os_command("modprobe vfio_iommu_type1 allow_unsafe_interrupts=1")
+        with open(
+            "/sys/module/vfio_iommu_type1/parameters/allow_unsafe_interrupts", "w"
+        ) as mfh:
+            mfh.write("Y")
+    except Exception:
+        logger.out(
+            "Failed to enable vfio_iommu_type1 kernel module; SR-IOV may fail",
+            state="w",
+        )
+
+    # Loop through our SR-IOV NICs and enable the numvfs for each
+    for device in config["sriov_device"]:
+        logger.out(
+            f'Preparing SR-IOV PF {device["phy"]} with {device["vfcount"]} VFs',
+            state="i",
+        )
+        try:
+            with open(
+                f'/sys/class/net/{device["phy"]}/device/sriov_numvfs', "r"
+            ) as vfh:
+                current_vf_count = vfh.read().strip()
+            with open(
+                f'/sys/class/net/{device["phy"]}/device/sriov_numvfs', "w"
+            ) as vfh:
+                vfh.write(str(device["vfcount"]))
+        except FileNotFoundError:
+            logger.out(
+                f'Failed to open SR-IOV configuration for PF {device["phy"]}; device may not support SR-IOV',
+                state="w",
+            )
+        except OSError:
+            logger.out(
+                f'Failed to set SR-IOV VF count for PF {device["phy"]} to {device["vfcount"]}; already set to {current_vf_count}',
+                state="w",
+            )
+
+        if device.get("mtu", None) is not None:
+            logger.out(
+                f'Setting SR-IOV PF {device["phy"]} to MTU {device["mtu"]}', state="i"
+            )
+            common.run_os_command(f'ip link set {device["phy"]} mtu {device["mtu"]} up')
+
+
+def setup_interfaces(logger, config):
+    # Set up the Cluster interface
+    cluster_dev = config["cluster_dev"]
+    cluster_mtu = config["cluster_mtu"]
+    cluster_dev_ip = config["cluster_dev_ip"]
+
+    logger.out(
+        f"Setting up Cluster network interface {cluster_dev} with MTU {cluster_mtu}",
+        state="i",
+    )
+
+    common.run_os_command(f"ip link set {cluster_dev} mtu {cluster_mtu} up")
+
+    logger.out(
+        f"Setting up Cluster network bridge on interface {cluster_dev} with IP {cluster_dev_ip}",
+        state="i",
+    )
+
+    common.run_os_command("brctl addbr brcluster")
+    common.run_os_command(f"brctl addif brcluster {cluster_dev}")
+    common.run_os_command(f"ip link set brcluster mtu {cluster_mtu} up")
+    common.run_os_command(f"ip address add {cluster_dev_ip} dev brcluster")
+
+    # Set up the Storage interface
+    storage_dev = config["storage_dev"]
+    storage_mtu = config["storage_mtu"]
+    storage_dev_ip = config["storage_dev_ip"]
+
+    logger.out(
+        f"Setting up Storage network interface {storage_dev} with MTU {storage_mtu}",
+        state="i",
+    )
+
+    common.run_os_command(f"ip link set {storage_dev} mtu {storage_mtu} up")
+
+    if storage_dev == cluster_dev:
+        if storage_dev_ip != cluster_dev_ip:
+            logger.out(
+                f"Setting up Storage network on Cluster network bridge with IP {storage_dev_ip}",
+                state="i",
+            )
+
+            common.run_os_command(f"ip address add {storage_dev_ip} dev brcluster")
+    else:
+        logger.out(
+            f"Setting up Storage network bridge on interface {storage_dev} with IP {storage_dev_ip}",
+            state="i",
+        )
+
+        common.run_os_command("brctl addbr brstorage")
+        common.run_os_command(f"brctl addif brstorage {storage_dev}")
+        common.run_os_command(f"ip link set brstorage mtu {storage_mtu} up")
+        common.run_os_command(f"ip address add {storage_dev_ip} dev brstorage")
+
+    # Set up the Upstream interface
+    upstream_dev = config["upstream_dev"]
+    upstream_mtu = config["upstream_mtu"]
+    upstream_dev_ip = config["upstream_dev_ip"]
+
+    logger.out(
+        f"Setting up Upstream network interface {upstream_dev} with MTU {upstream_mtu}",
+        state="i",
+    )
+
+    if upstream_dev == cluster_dev:
+        if upstream_dev_ip != cluster_dev_ip:
+            logger.out(
+                f"Setting up Upstream network on Cluster network bridge with IP {upstream_dev_ip}",
+                state="i",
+            )
+
+            common.run_os_command(f"ip address add {upstream_dev_ip} dev brcluster")
+    else:
+        logger.out(
+            f"Setting up Upstream network bridge on interface {upstream_dev} with IP {upstream_dev_ip}",
+            state="i",
+        )
+
+        common.run_os_command("brctl addbr brupstream")
+        common.run_os_command(f"brctl addif brupstream {upstream_dev}")
+        common.run_os_command(f"ip link set brupstream mtu {upstream_mtu} up")
+        common.run_os_command(f"ip address add {upstream_dev_ip} dev brupstream")
+
+    upstream_gateway = config["upstream_gateway"]
+    if upstream_gateway is not None:
+        logger.out(
+            f"Setting up Upstream network default gateway IP {upstream_gateway}",
+            state="i",
+        )
+        if upstream_dev == cluster_dev:
+            common.run_os_command(
+                f"ip route add default via {upstream_gateway} dev brcluster"
+            )
+        else:
+            common.run_os_command(
+                f"ip route add default via {upstream_gateway} dev brupstream"
+            )
+
+    # Set up sysctl tweaks to optimize networking
+    # Enable routing functions
+    common.run_os_command("sysctl net.ipv4.ip_forward=1")
+    common.run_os_command("sysctl net.ipv6.ip_forward=1")
+    # Enable send redirects
+    common.run_os_command("sysctl net.ipv4.conf.all.send_redirects=1")
+    common.run_os_command("sysctl net.ipv4.conf.default.send_redirects=1")
+    common.run_os_command("sysctl net.ipv6.conf.all.send_redirects=1")
+    common.run_os_command("sysctl net.ipv6.conf.default.send_redirects=1")
+    # Accept source routes
+    common.run_os_command("sysctl net.ipv4.conf.all.accept_source_route=1")
+    common.run_os_command("sysctl net.ipv4.conf.default.accept_source_route=1")
+    common.run_os_command("sysctl net.ipv6.conf.all.accept_source_route=1")
+    common.run_os_command("sysctl net.ipv6.conf.default.accept_source_route=1")
+    # Disable RP filtering on Cluster and Upstream interfaces (to allow traffic pivoting)
+    common.run_os_command(f"sysctl net.ipv4.conf.{cluster_dev}.rp_filter=0")
+    common.run_os_command("sysctl net.ipv4.conf.brcluster.rp_filter=0")
+    common.run_os_command(f"sysctl net.ipv4.conf.{upstream_dev}.rp_filter=0")
+    common.run_os_command("sysctl net.ipv4.conf.brupstream.rp_filter=0")
+    common.run_os_command(f"sysctl net.ipv6.conf.{cluster_dev}.rp_filter=0")
+    common.run_os_command("sysctl net.ipv6.conf.brcluster.rp_filter=0")
+    common.run_os_command(f"sysctl net.ipv6.conf.{upstream_dev}.rp_filter=0")
+    common.run_os_command("sysctl net.ipv6.conf.brupstream.rp_filter=0")
+
+    # Stop DNSMasq if it is running
+    common.run_os_command("systemctl stop dnsmasq.service")
+
+    logger.out("Waiting 3 seconds for networking to come up", state="s")
+    sleep(3)
+
+
+def create_nft_configuration(logger, config):
+    if config["enable_networking"]:
+        logger.out("Creating NFT firewall configuration", state="i")
+
+        dynamic_directory = config["nft_dynamic_directory"]
+
+        # Create directories
+        makedirs(f"{dynamic_directory}/networks", exist_ok=True)
+        makedirs(f"{dynamic_directory}/static", exist_ok=True)
+
+        # Set up the base rules
+        nftables_base_rules = f"""# Base rules
+        flush ruleset
+        # Add the filter table and chains
+        add table inet filter
+        add chain inet filter forward {{ type filter hook forward priority 0; }}
+        add chain inet filter input {{ type filter hook input priority 0; }}
+        # Include static rules and network rules
+        include "{dynamic_directory}/static/*"
+        include "{dynamic_directory}/networks/*"
+        """
+
+        # Write the base firewall config
+        nftables_base_filename = f"{dynamic_directory}/base.nft"
+        with open(nftables_base_filename, "w") as nftfh:
+            nftfh.write(nftables_base_rules)
+        common.reload_firewall_rules(nftables_base_filename, logger)
--- a/health-daemon/pvchealthd/util/services.py
+++ b/health-daemon/pvchealthd/util/services.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+
+# services.py - Utility functions for pvcnoded external services
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+import daemon_lib.common as common
+from time import sleep
+
+
+def start_zookeeper(logger, config):
+    if config["daemon_mode"] == "coordinator":
+        logger.out("Starting Zookeeper daemon", state="i")
+        # TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
+        common.run_os_command("systemctl start zookeeper.service")
+
+
+def start_libvirtd(logger, config):
+    if config["enable_hypervisor"]:
+        logger.out("Starting Libvirt daemon", state="i")
+        # TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
+        common.run_os_command("systemctl start libvirtd.service")
+
+
+def start_patroni(logger, config):
+    if config["enable_networking"] and config["daemon_mode"] == "coordinator":
+        logger.out("Starting Patroni daemon", state="i")
+        # TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
+        common.run_os_command("systemctl start patroni.service")
+
+
+def start_frrouting(logger, config):
+    if config["enable_networking"] and config["daemon_mode"] == "coordinator":
+        logger.out("Starting FRRouting daemon", state="i")
+        # TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
+        common.run_os_command("systemctl start frr.service")
+
+
+def start_ceph_mon(logger, config):
+    if config["enable_storage"] and config["daemon_mode"] == "coordinator":
+        logger.out("Starting Ceph Monitor daemon", state="i")
+        # TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
+        common.run_os_command(
+            f'systemctl start ceph-mon@{config["node_hostname"]}.service'
+        )
+
+
+def start_ceph_mgr(logger, config):
+    if config["enable_storage"] and config["daemon_mode"] == "coordinator":
+        logger.out("Starting Ceph Manager daemon", state="i")
+        # TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
+        common.run_os_command(
+            f'systemctl start ceph-mgr@{config["node_hostname"]}.service'
+        )
+
+
+def start_keydb(logger, config):
+    if (config["enable_api"] or config["enable_worker"]) and config[
+        "daemon_mode"
+    ] == "coordinator":
+        logger.out("Starting KeyDB daemon", state="i")
+        # TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
+        common.run_os_command("systemctl start keydb-server.service")
+
+
+def start_worker(logger, config):
+    if config["enable_worker"]:
+        logger.out("Starting Celery Worker daemon", state="i")
+        # TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
+        common.run_os_command("systemctl start pvcworkerd.service")
+
+
+def start_system_services(logger, config):
+    start_zookeeper(logger, config)
+    start_libvirtd(logger, config)
+    start_patroni(logger, config)
+    start_frrouting(logger, config)
+    start_ceph_mon(logger, config)
+    start_ceph_mgr(logger, config)
+    start_keydb(logger, config)
+    start_worker(logger, config)
+
+    logger.out("Waiting 10 seconds for daemons to start", state="s")
+    sleep(10)
--- a/health-daemon/pvchealthd/util/zookeeper.py
+++ b/health-daemon/pvchealthd/util/zookeeper.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python3
+
+# <Filename> - <Description>
+# zookeeper.py - Utility functions for pvcnoded Zookeeper connections
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, version 3.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+##############################################################################
+
+from daemon_lib.zkhandler import ZKHandler
+
+import os
+import time
+
+
+def connect(logger, config):
+    # Create an instance of the handler
+    zkhandler = ZKHandler(config, logger)
+
+    try:
+        logger.out(
+            "Connecting to Zookeeper on coordinator nodes {}".format(
+                config["coordinators"]
+            ),
+            state="i",
+        )
+        # Start connection
+        zkhandler.connect(persistent=True)
+    except Exception as e:
+        logger.out(
+            "ERROR: Failed to connect to Zookeeper cluster: {}".format(e), state="e"
+        )
+        os._exit(1)
+
+    logger.out("Validating Zookeeper schema", state="i")
+
+    try:
+        node_schema_version = int(
+            zkhandler.read(("node.data.active_schema", config["node_hostname"]))
+        )
+    except Exception:
+        node_schema_version = int(zkhandler.read("base.schema.version"))
+        zkhandler.write(
+            [
+                (
+                    ("node.data.active_schema", config["node_hostname"]),
+                    node_schema_version,
+                )
+            ]
+        )
+
+    # Load in the current node schema version
+    zkhandler.schema.load(node_schema_version)
+
+    # Record the latest intalled schema version
+    latest_schema_version = zkhandler.schema.find_latest()
+    logger.out("Latest installed schema is {}".format(latest_schema_version), state="i")
+    zkhandler.write(
+        [(("node.data.latest_schema", config["node_hostname"]), latest_schema_version)]
+    )
+
+    # If we are the last node to get a schema update, fire the master update
+    if latest_schema_version > node_schema_version:
+        node_latest_schema_version = list()
+        for node in zkhandler.children("base.node"):
+            node_latest_schema_version.append(
+                int(zkhandler.read(("node.data.latest_schema", node)))
+            )
+
+        # This is true if all elements of the latest schema version are identical to the latest version,
+        # i.e. they have all had the latest schema installed and ready to load.
+        if node_latest_schema_version.count(latest_schema_version) == len(
+            node_latest_schema_version
+        ):
+            zkhandler.write([("base.schema.version", latest_schema_version)])
+
+    return zkhandler, node_schema_version
+
+
+def validate_schema(logger, zkhandler):
+    # Validate our schema against the active version
+    if not zkhandler.schema.validate(zkhandler, logger):
+        logger.out("Found schema violations, applying", state="i")
+        zkhandler.schema.apply(zkhandler)
+    else:
+        logger.out("Schema successfully validated", state="o")
+
+
+def setup_node(logger, config, zkhandler):
+    # Check if our node exists in Zookeeper, and create it if not
+    if config["daemon_mode"] == "coordinator":
+        init_routerstate = "secondary"
+    else:
+        init_routerstate = "client"
+
+    if zkhandler.exists(("node", config["node_hostname"])):
+        logger.out(
+            f"Node is {logger.fmt_green}present{logger.fmt_end} in Zookeeper", state="i"
+        )
+        # Update static data just in case it's changed
+        zkhandler.write(
+            [
+                (("node", config["node_hostname"]), config["daemon_mode"]),
+                (("node.mode", config["node_hostname"]), config["daemon_mode"]),
+                (("node.state.daemon", config["node_hostname"]), "init"),
+                (("node.state.router", config["node_hostname"]), init_routerstate),
+                (
+                    ("node.data.static", config["node_hostname"]),
+                    " ".join(config["static_data"]),
+                ),
+                (
+                    ("node.data.pvc_version", config["node_hostname"]),
+                    config["daemon_version"],
+                ),
+                (
+                    ("node.ipmi.hostname", config["node_hostname"]),
+                    config["ipmi_hostname"],
+                ),
+                (
+                    ("node.ipmi.username", config["node_hostname"]),
+                    config["ipmi_username"],
+                ),
+                (
+                    ("node.ipmi.password", config["node_hostname"]),
+                    config["ipmi_password"],
+                ),
+            ]
+        )
+    else:
+        logger.out(
+            f"Node is {logger.fmt_red}absent{logger.fmt_end} in Zookeeper; adding new node",
+            state="i",
+        )
+        keepalive_time = int(time.time())
+        zkhandler.write(
+            [
+                (("node", config["node_hostname"]), config["daemon_mode"]),
+                (("node.keepalive", config["node_hostname"]), str(keepalive_time)),
+                (("node.mode", config["node_hostname"]), config["daemon_mode"]),
+                (("node.state.daemon", config["node_hostname"]), "init"),
+                (("node.state.domain", config["node_hostname"]), "flushed"),
+                (("node.state.router", config["node_hostname"]), init_routerstate),
+                (
+                    ("node.data.static", config["node_hostname"]),
+                    " ".join(config["static_data"]),
+                ),
+                (
+                    ("node.data.pvc_version", config["node_hostname"]),
+                    config["daemon_version"],
+                ),
+                (
+                    ("node.ipmi.hostname", config["node_hostname"]),
+                    config["ipmi_hostname"],
+                ),
+                (
+                    ("node.ipmi.username", config["node_hostname"]),
+                    config["ipmi_username"],
+                ),
+                (
+                    ("node.ipmi.password", config["node_hostname"]),
+                    config["ipmi_password"],
+                ),
+                (("node.memory.total", config["node_hostname"]), "0"),
+                (("node.memory.used", config["node_hostname"]), "0"),
+                (("node.memory.free", config["node_hostname"]), "0"),
+                (("node.memory.allocated", config["node_hostname"]), "0"),
+                (("node.memory.provisioned", config["node_hostname"]), "0"),
+                (("node.vcpu.allocated", config["node_hostname"]), "0"),
+                (("node.cpu.load", config["node_hostname"]), "0.0"),
+                (("node.running_domains", config["node_hostname"]), "0"),
+                (("node.count.provisioned_domains", config["node_hostname"]), "0"),
+                (("node.count.networks", config["node_hostname"]), "0"),
+            ]
+        )