Split health monitoring into discrete daemon/pkg

This commit is contained in:
2023-11-29 15:36:49 -05:00
parent 74a416165d
commit 41f4e4fb2f
40 changed files with 3032 additions and 33 deletions

169
health-daemon/plugins/disk Normal file
View File

@ -0,0 +1,169 @@
#!/usr/bin/env python3
# disk.py - PVC Monitoring example plugin for disk (system + OSD)
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system and OSD disks for errors and faults and return
# a health delta corresponding to severity.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "disk"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Discover the disks that can be queried with smartctl

        Lists the block devices reported by lsblk, probes each one with
        "smartctl --info", and stores a list of (device path, extra smartctl
        option, device type) tuples in self.disk_details.

        Returns a failure message if the lsblk output is not valid JSON; otherwise
        returns None.
        """

        from daemon_lib.common import run_os_command
        from json import loads

        _, lsblk_output, _ = run_os_command("lsblk --json --paths --include 8,259")
        try:
            lsblk_data = loads(lsblk_output)
        except Exception as e:
            return f"Error loading lsblk JSON: {e}"

        def get_smartinfo(disk, extra_opt=""):
            # Probe a single device; None means the smartctl output was not valid JSON
            _, raw_info, _ = run_os_command(f"smartctl --info --json {extra_opt} {disk}")
            try:
                return loads(raw_info)
            except Exception:
                return None

        device_paths = [device["name"] for device in lsblk_data['blockdevices']]

        found = []
        for path in device_paths:
            option = ""
            info = get_smartinfo(path)
            if info is None:
                continue

            exit_status = info["smartctl"]["exit_status"]
            if exit_status > 1:
                continue

            if exit_status == 1:
                # smartctl may ask for an explicit device option; retry once with it,
                # substituting 0 for the placeholder "N" in the suggested option
                hint = info["smartctl"]["messages"][0]["string"]
                if "requires option" not in hint:
                    continue
                option = hint.split("'")[1].replace('N', '0')
                info = get_smartinfo(path, option)
                if info is None or info["smartctl"]["exit_status"] > 0:
                    continue

            found.append((path, option, info["device"]["type"]))

        self.disk_details = found

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        Re-runs the disk discovery, then reads the full SMART data of every known
        disk. Each problem found adds 10 to the health delta and one message.

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
        """

        # Refresh the disk list on every run so that it is always current
        self.setup()

        # Run any imports first
        from daemon_lib.common import run_os_command
        from json import loads

        health_delta = 0
        messages = []

        for disk, extra_opt, disk_type in self.disk_details:
            _, raw_smart, _ = run_os_command(f"smartctl --all --json {extra_opt} {disk}")
            try:
                smart_info = loads(raw_smart)
            except Exception:
                health_delta += 10
                messages.append(f"{disk} failed to load SMART data")
                continue

            if disk_type != 'nvme':
                # Non-NVMe disks: any attribute with a non-empty "when_failed" is a fault
                table = smart_info.get('ata_smart_attributes', {}).get('table', [])
                for attribute in table:
                    if attribute["when_failed"]:
                        health_delta += 10
                        messages.append(f"{disk} attribute {attribute['name']} value {attribute['raw']['value']}")
                continue

            # NVMe disks: check selected fields of the health information log
            health_log = smart_info.get('nvme_smart_health_information_log', {})
            for key, value in health_log.items():
                if key == "critical_warning":
                    if value > 0:
                        health_delta += 10
                        messages.append(f"{disk} critical warning value {value}")
                elif key == "media_errors":
                    if value > 0:
                        health_delta += 10
                        messages.append(f"{disk} media errors value {value}")
                elif key == "percentage_used":
                    if value > 90:
                        health_delta += 10
                        messages.append(f"{disk} percentage used value {value}%")

        if not messages:
            messages.append(f"All {len(self.disk_details)} checked disks report OK: {', '.join([entry[0] for entry in self.disk_details])}")

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(', '.join(messages))

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This plugin holds no resources, so there is nothing to clean up.
        """

        pass

162
health-daemon/plugins/dpkg Normal file
View File

@ -0,0 +1,162 @@
#!/usr/bin/env python3
# dpkg.py - PVC Monitoring example plugin for dpkg status
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system dpkg status is as expected, with no invalid
# packages or obsolete configuration files, and will return a 1 health delta for each
# flaw in invalid packages, upgradable packages, and obsolete config files.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "dpkg"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        Collects three classes of flaw and adds 1 to the health delta for each class
        that has at least one entry:
          * packages whose dpkg state is anything other than "ii"
          * packages listed by "apt list --upgradable"
          * leftover *.dpkg-*, *.ucf-* or *.update-* files under /etc

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
        """

        # Run any imports first
        from re import match
        import daemon_lib.common as pvc_common

        # Get Debian version
        with open('/etc/debian_version', 'r') as fh:
            debian_version = fh.read().strip()

        # Get a list of dpkg packages for analysis
        retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/dpkg --list")

        # Get a list of installed packages and states. The state column is two
        # letters, optionally followed by an error flag, and several of the broken
        # states use uppercase letters (e.g. "iU", "iF", "iH"), so both cases must
        # be accepted or exactly the inconsistent packages would be skipped.
        packages = list()
        for dpkg_line in stdout.split('\n'):
            if match(r'^[a-zA-Z]{2,3}\s', dpkg_line):
                line_split = dpkg_line.split()
                package_state = line_split[0]
                package_name = line_split[1]
                packages.append((package_name, package_state))

        list_inconsistent = [name for name, state in packages if state != "ii"]
        count_inconsistent = len(list_inconsistent)

        # Get upgradable packages. Each package line has the form
        # "<name>/<suite> <version> <arch> [upgradable from: ...]", so match on the
        # package name followed by a slash; this also skips the "Listing..." header.
        retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/apt list --upgradable")

        list_upgradable = list()
        for apt_line in stdout.split('\n'):
            if match(r'^[a-z0-9][a-z0-9.+-]+/', apt_line):
                list_upgradable.append(apt_line.split('/')[0])

        count_upgradable = len(list_upgradable)

        # Get obsolete config files (dpkg-*, ucf-*, or update-* under /etc)
        # A raw string keeps the backslashes before the parentheses intact
        retcode, stdout, stderr = pvc_common.run_os_command(r"/usr/bin/find /etc -type f -a \( -name '*.dpkg-*' -o -name '*.ucf-*' -o -name '*.update-*' \)")

        obsolete_conffiles = [conffile_line for conffile_line in stdout.split('\n') if conffile_line]
        count_obsolete_conffiles = len(obsolete_conffiles)

        # Set health_delta based on the results
        health_delta = 0
        if count_inconsistent > 0:
            health_delta += 1
        if count_upgradable > 0:
            health_delta += 1
        if count_obsolete_conffiles > 0:
            health_delta += 1

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Craft the message
        message = f"Debian {debian_version}; Obsolete conffiles: {count_obsolete_conffiles}; Packages inconsistent: {count_inconsistent}, upgradable: {count_upgradable}"

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Set the detailed data in our local PluginResult object
        detailed_data = {
            "debian_version": debian_version,
            "obsolete_conffiles": obsolete_conffiles,
            "inconsistent_packages": list_inconsistent,
            "upgradable_packages": list_upgradable,
        }
        self.plugin_result.set_data(detailed_data)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass

108
health-daemon/plugins/edac Normal file
View File

@ -0,0 +1,108 @@
#!/usr/bin/env python3
# edac.py - PVC Monitoring example plugin for EDAC
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system's EDAC registers and report any failures.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "edac"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        Runs edac-util and lists every per-csrow line it reports. The health delta
        is set to 50 as soon as any of those lines mentions "Uncorrected"; other
        reported lines are listed in the message without affecting the delta.

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
        """

        # Run any imports first
        import daemon_lib.common as common
        from re import match

        # Get edac-util output
        retcode, stdout, stderr = common.run_os_command('/usr/bin/edac-util')

        health_delta = 0

        # If there's no errors, we're OK
        if match(r'^edac-util: No errors to report.', stdout):
            message = "EDAC reports no errors"
        else:
            message = "EDAC reports errors: "
            errors = list()
            for line in stdout.split('\n'):
                # Accept memory controller numbers with more than one digit (mc10, ...)
                if match(r'^mc[0-9]+: csrow', line):
                    if 'Uncorrected' in line:
                        health_delta = 50
                    # Drop the leading "mcN:" and "csrowN:" fields from the report
                    errors.append(' '.join(line.split()[2:]))
            message += ', '.join(errors)

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass

247
health-daemon/plugins/hwrd Normal file
View File

@ -0,0 +1,247 @@
#!/usr/bin/env python3
# hwrd.py - PVC Monitoring example plugin for hardware RAID Arrays
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2023 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check any hardware RAID virtual disks for health and report errors.
# Supports Dell BOSS cards, LSI/Avago/Broadcom MegaRAID, and HP SmartArray RAID.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "hwrd"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    @staticmethod
    def _is_megaraid_vd(vd_lines):
        # A MegaCLI output chunk describes a virtual drive when its third line
        # carries the "Virtual Drive Information" header; shorter chunks never do
        return len(vd_lines) > 2 and "Virtual Drive Information" in vd_lines[2]

    def check_dellboss(self):
        """
        check_dellboss(): Check the virtual disks of a Dell BOSS card via mvcli

        Returns a (health_delta, messages) tuple. Every array that is not
        "functional" adds 50 to the delta; a failing mvcli command yields 50.
        """

        # Run any imports first
        from daemon_lib.common import run_os_command
        from re import match

        health_delta = 0
        messages = list()

        _dellboss_ret, _dellboss_list, _ = run_os_command("mvcli info -o vd")
        if _dellboss_ret != 0:
            health_delta = 50
            messages.append("Error running MVCLI command")
        else:
            # Build one dict per "id:" line and attach the following name/status
            # lines to it; the reported id is kept in the dict rather than used as
            # a list index, so non-contiguous ids cannot cause an IndexError
            arrays = list()
            current = None
            for line in _dellboss_list.split('\n'):
                if match(r"^id:", line):
                    current = {"id": int(line.split(":")[-1].strip())}
                    arrays.append(current)
                elif current is None:
                    # Ignore anything that appears before the first array
                    continue
                elif match(r"^name:", line):
                    current["name"] = line.split(":")[-1].strip()
                elif match(r"^status:", line):
                    current["status"] = line.split(":")[-1].strip()

            for array in arrays:
                name = array.get("name", "unknown")
                status = array.get("status", "unknown")
                if status != "functional":
                    health_delta += 50
                messages.append(f"RAID Dell BOSS ID {array['id']} (Name: {name}, State: {status})")

        if len(messages) < 1:
            messages.append("No valid RAID arrays found")

        return health_delta, messages

    def check_megaraid(self):
        """
        check_megaraid(): Check the virtual drives of a MegaRAID controller via megacli

        Returns a (health_delta, messages) tuple. Every virtual drive that is not
        "Optimal" adds 50 to the delta, every virtual drive without usable values
        adds 10, and a failing megacli command yields 50.
        """

        # Run any imports first
        from daemon_lib.common import run_os_command

        health_delta = 0
        messages = list()

        _megaraid_ret, _megaraid_list, _ = run_os_command("megacli -LDInfo -Lall -aALL")
        if _megaraid_ret != 0:
            health_delta = 50
            messages.append("Error running MegaCLI command")
        else:
            vd_list = _megaraid_list.split('\n\n\n')
            for idx, _vd in enumerate(vd_list):
                vd = _vd.split('\n')
                if not self._is_megaraid_vd(vd):
                    continue

                raid_name = None
                raid_count = 0
                raid_state = None

                for entry in vd:
                    fields = entry.split(':')
                    if len(fields) < 2:
                        continue

                    entry_key = fields[0].strip()
                    entry_value = fields[1].strip()

                    if entry_key == "State":
                        raid_state = entry_value
                    if entry_key == "Name":
                        raid_name = entry_value
                    if entry_key == "Number Of Drives":
                        raid_count = entry_value

                # raid_count only stays at the integer 0 when no drive count was seen
                if raid_state is None or raid_name is None or raid_count == 0:
                    health_delta += 10
                    messages.append(f"RAID ID {idx} did not report useful values")
                    continue

                if raid_state != "Optimal":
                    health_delta += 50
                messages.append(f"RAID MegaRAID ID {idx} (Name: {raid_name}, Disks: {raid_count}, State: {raid_state})")

        if len(messages) < 1:
            messages.append("No valid RAID arrays found")

        return health_delta, messages

    def check_hpsa(self):
        """
        check_hpsa(): Check the logical drives of an HP SmartArray controller via ssacli

        Queries the controller slot recorded by setup() in self.controller_slot.

        Returns a (health_delta, messages) tuple. Every logical drive whose line
        does not contain "OK" adds 50 to the delta; a failing ssacli command
        yields 50.
        """

        # Run any imports first
        from daemon_lib.common import run_os_command
        from re import match

        health_delta = 0
        messages = list()

        _hparray_ret, _hparray_list, _ = run_os_command(f"ssacli ctrl slot={self.controller_slot} ld all show")
        if _hparray_ret != 0:
            health_delta = 50
            messages.append("Error running SSACLI command")
        else:
            vd_lines = _hparray_list.split('\n\n')

            # Pair every "logicaldrive" paragraph with the "Array" paragraph before it
            arrays = list()
            cur_array = None
            for _line in vd_lines:
                line = _line.strip()
                if match(r"^Array", line):
                    cur_array = line
                if match(r"^logicaldrive", line) and cur_array is not None:
                    arrays.append(f"{cur_array} {line}")

            for array in arrays:
                if "OK" not in array:
                    health_delta += 50
                messages.append(f"RAID HPSA {array}")

        if len(messages) < 1:
            messages.append("No valid RAID arrays found")

        return health_delta, messages

    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        Detects which hardware RAID management tools report a controller and
        records the matching types in self.raid_type. Each type is recorded at
        most once, so that run() performs each check exactly once regardless of
        how many virtual drives or controllers the tool lists.

        Returns a failure message if no supported RAID controller was found;
        otherwise returns None.
        """

        from daemon_lib.common import run_os_command
        from re import match, findall

        self.raid_type = list()

        _dellboss_ret, _dellboss_list, _ = run_os_command("mvcli info -o vd")
        if _dellboss_ret == 0:
            # If this returns 0 at all, there's a valid BOSS card to manage
            self.raid_type.append("dellboss")

        _megaraid_ret, _megaraid_list, _ = run_os_command("megacli -LDInfo -Lall -aALL")
        if _megaraid_ret == 0:
            vd_list = _megaraid_list.split('\n\n\n')
            # One virtual drive is enough to enable the MegaRAID check
            if any(self._is_megaraid_vd(_vd.split('\n')) for _vd in vd_list):
                self.raid_type.append("megaraid")

        _hpraid_ret, _hpraid_list, _ = run_os_command("ssacli ctrl all show status")
        if _hpraid_ret == 0:
            for line in _hpraid_list.split('\n'):
                if match(r"^Smart", line):
                    # Capture the full slot number, which may have several digits
                    controller_slots = findall("Slot ([0-9]+)", line)
                    if len(controller_slots) > 0:
                        if "hpsa" not in self.raid_type:
                            self.raid_type.append("hpsa")
                        # With several controllers the last one listed is checked
                        self.controller_slot = controller_slots[0]

        if len(self.raid_type) < 1:
            return "No hardware RAID management commands found"

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        Runs the check function of every RAID type found by setup() and combines
        their health deltas and messages.
        """

        health_delta = 0
        messages = list()

        raid_function_map = {
            "megaraid": self.check_megaraid,
            "hpsa": self.check_hpsa,
            "dellboss": self.check_dellboss,
        }

        for raid_type in self.raid_type:
            _health_delta, _messages = raid_function_map.get(raid_type)()
            health_delta += _health_delta
            messages += _messages

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(', '.join(messages))

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass

109
health-daemon/plugins/ipmi Normal file
View File

@ -0,0 +1,109 @@
#!/usr/bin/env python3
# ipmi.py - PVC Monitoring example plugin for IPMI
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check whether the system IPMI is reachable.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "ipmi"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        Asks the node's IPMI interface for its chassis power status, using the
        ipmi_hostname, ipmi_username and ipmi_password values from self.config.
        A non-zero return code from ipmitool results in a health delta of 10.

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
        """

        # Run any imports first
        from daemon_lib.common import run_os_command
        from shlex import quote

        # Check the node's IPMI interface
        ipmi_hostname = self.config["ipmi_hostname"]
        ipmi_username = self.config["ipmi_username"]
        ipmi_password = self.config["ipmi_password"]

        # Quote each value so that credentials containing spaces, quotes or other
        # special characters reach ipmitool as a single, unmodified argument;
        # values made up of safe characters only are passed through unchanged
        retcode, _, _ = run_os_command(
            f"/usr/bin/ipmitool -I lanplus -H {quote(str(ipmi_hostname))} -U {quote(str(ipmi_username))} -P {quote(str(ipmi_password))} chassis power status",
            timeout=5
        )

        if retcode > 0:
            # Set the health delta to 10 (subtract 10 from the total of 100)
            health_delta = 10
            # Craft a message that can be used by the clients
            message = f"IPMI via {ipmi_username}@{ipmi_hostname} is NOT responding"
        else:
            # Set the health delta to 0 (no change)
            health_delta = 0
            # Craft a message that can be used by the clients
            message = f"IPMI via {ipmi_username}@{ipmi_hostname} is responding"

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass

106
health-daemon/plugins/kydb Normal file
View File

@ -0,0 +1,106 @@
#!/usr/bin/env python3
# kydb.py - PVC Monitoring example plugin for KeyDB/Redis
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2023 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the KeyDB/Redis instance on the node for operation.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "kydb"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        Connects to the KeyDB/Redis instance on localhost port 6379 and requests
        its server information. Any exception results in a health delta of 50 and
        a message carrying the error.

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
        """

        # Run any imports first
        from redis import Redis

        rd_conn = None

        # Set the health delta to 0 (no change)
        health_delta = 0
        # Craft a message that can be used by the clients
        message = "Successfully connected to KeyDB/Redis on localhost"

        # Check the KeyDB/Redis connection
        try:
            rd_conn = Redis(host='localhost', port=6379, decode_responses=True)
            # The returned information is not needed; the call only has to succeed
            rd_conn.info()
        except Exception as e:
            health_delta = 50
            message = f"Failed to connect to KeyDB/Redis: {e}"
        finally:
            # Drop our reference to the client object
            del rd_conn

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass

107
health-daemon/plugins/lbvt Normal file
View File

@ -0,0 +1,107 @@
#!/usr/bin/env python3
# lbvt.py - PVC Monitoring example plugin for Libvirtd
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the Libvirt daemon instance on the node for operation.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "lbvt"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        Nothing needs to be prepared for this plugin, so no failure message is
        ever returned.
        """

        pass

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        Opens a read-only Libvirt connection to this node over qemu+tcp and asks
        for the hostname. Any exception results in a health delta of 50 and a
        message carrying the error; otherwise the delta is 0.

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
        """

        # Run any imports first
        from libvirt import openReadOnly as lvopen

        connection = None

        # Check the Libvirt connection
        try:
            connection = lvopen(f"qemu+tcp://{self.this_node.name}/system")
            # The hostname itself is not used; the call only has to succeed
            connection.getHostname()
        except Exception as err:
            outcome = (50, f"Failed to connect to Libvirtd: {err}")
        else:
            outcome = (0, "Successfully connected to Libvirtd on localhost")
        finally:
            # Only close a connection that was actually opened
            if connection is not None:
                connection.close()

        health_delta, message = outcome

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This plugin holds no resources between runs, so there is nothing to do.
        """

        pass

109
health-daemon/plugins/load Normal file
View File

@ -0,0 +1,109 @@
#!/usr/bin/env python3
# load.py - PVC Monitoring example plugin for load
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system load against the total number of CPU cores.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "load"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    """Monitoring plugin that compares the system load against the CPU core count."""

    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error. This plugin has no
        such conditions, so it always returns None.
        """

        return None

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator); this plugin does not use it.

        The health delta is 50 when the 1-minute load average is strictly greater than
        the CPU count reported by psutil, and 0 otherwise.
        """

        # Run any imports first
        from os import getloadavg
        from psutil import cpu_count

        # 1-minute system load average, rounded to two decimal places
        one_minute_load = float(round(getloadavg()[0], 2))

        # CPU count as reported by psutil
        core_count = cpu_count()

        # Overloaded only when the load strictly exceeds the CPU count
        overloaded = one_minute_load > float(core_count)
        health_delta = 50 if overloaded else 0

        # The client-facing message is the same whether or not the node is overloaded
        message = f"Current load is {one_minute_load} out of {core_count} CPU cores"

        # Record the outcome in our local PluginResult object and hand it back
        self.plugin_result.set_health_delta(health_delta)
        self.plugin_result.set_message(message)
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly. Nothing to clean up here.
        """

        return None

198
health-daemon/plugins/nics Normal file
View File

@ -0,0 +1,198 @@
#!/usr/bin/env python3
# nics.py - PVC Monitoring example plugin for NIC interfaces
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the network interfaces of the host, specifically for speed
# and 802.3ad status (if applicable).
# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "nics"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    """Monitoring plugin that checks the core NICs for link speed and bond slave status."""

    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    @staticmethod
    def _device_type(dev):
        """
        Return the value of the first line of /sys/class/net/{dev}/uevent

        The callers compare this value against 'vlan' and 'bond'.
        """

        with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent:
            return uevent.readlines()[0].split('=')[-1].strip()

    @staticmethod
    def _bond_slaves(bonding_stats):
        """
        Return a list of (interface name, MII status) tuples from /proc/net/bonding text

        Only blank-line-separated sections that contain a "Slave Interface:" line are
        treated as slaves, so header sections can never contribute a stale or undefined
        entry. The status is None if a slave section has no "MII Status:" line.
        """

        slaves = []
        for section in bonding_stats.split('\n\n'):
            name = None
            status = None
            for line in section.split('\n'):
                if line.startswith('Slave Interface:'):
                    name = line.split()[-1]
                elif line.startswith('MII Status:'):
                    status = line.split()[-1]
            if name is not None:
                slaves.append((name, status))
        return slaves

    @staticmethod
    def _supported_link_speeds(ethtool_stdout):
        """
        Return the set of speeds listed under "Supported link modes:" in ethtool output

        The speed of a line is the first run of digits in its last whitespace-separated
        token (e.g. 1000 for "1000baseT/Full", 10000 for "10000baseSR/Full"). Lines
        without such a number are skipped rather than raising, so the result may be an
        empty set.
        """

        # Run any imports first
        from re import search

        speeds = set()
        in_modes = False
        for line in ethtool_stdout.split('\n'):
            if 'Supported link modes:' in line:
                in_modes = True
            if 'Supported pause frame use:' in line:
                # The link modes section ends here
                break
            if not in_modes:
                continue
            tokens = line.split()
            if not tokens:
                continue
            digits = search(r'\d+', tokens[-1])
            if digits is not None:
                speeds.add(int(digits.group()))
        return speeds

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator); this plugin does not use it.

        For each underlying device of the configured bridge, upstream, cluster and storage
        devices, adds 10 to the health delta if it runs below the highest supported link
        speed, and a further 10 for a bond whose slaves are not all up.
        """

        # Run any imports first
        import daemon_lib.common as common

        messages = []
        health_delta = 0

        # Get the set of underlying devices; a VLAN is replaced by the device named
        # on the "Device:" line of its /proc/net/vlan entry
        underlying_devs = set()
        for dev in (
            self.config['bridge_dev'],
            self.config['upstream_dev'],
            self.config['cluster_dev'],
            self.config['storage_dev'],
        ):
            if self._device_type(dev) == 'vlan':
                with open(f"/proc/net/vlan/{dev}") as devfh:
                    vlan_info = devfh.read().split('\n')
                for line in vlan_info:
                    if line.startswith('Device:'):
                        dev = line.split()[-1]
            underlying_devs.add(dev)

        for dev in sorted(underlying_devs):
            if self._device_type(dev) == "bond":
                with open(f"/proc/net/bonding/{dev}") as devfh:
                    slaves = self._bond_slaves(devfh.read())

                # The bond is degraded unless every slave interface is up
                up_count = sum(1 for _name, status in slaves if status == 'up')
                if up_count < len(slaves):
                    messages.append(f"{dev} DEGRADED with {up_count} active slaves")
                    health_delta += 10
                else:
                    messages.append(f"{dev} OK with {up_count} active slaves")

                # Get ethtool supported speeds for slave interfaces
                supported_link_speeds = set()
                for slave_dev, _status in slaves:
                    _, ethtool_stdout, _ = common.run_os_command(f"ethtool {slave_dev}")
                    supported_link_speeds |= self._supported_link_speeds(ethtool_stdout)
            else:
                # Get ethtool supported speeds for interface
                _, ethtool_stdout, _ = common.run_os_command(f"ethtool {dev}")
                supported_link_speeds = self._supported_link_speeds(ethtool_stdout)

            # Ensure interface is running at its maximum speed
            with open(f"/sys/class/net/{dev}/speed") as devfh:
                dev_speed = int(devfh.read())

            # With no known supported speeds there is nothing to compare against, so
            # the device is not flagged (previously this case raised an IndexError)
            if supported_link_speeds and dev_speed < max(supported_link_speeds):
                messages.append(f"{dev} DEGRADED at {dev_speed} Mbps")
                health_delta += 10
            else:
                messages.append(f"{dev} OK at {dev_speed} Mbps")

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(', '.join(messages))

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass

141
health-daemon/plugins/psql Normal file
View File

@ -0,0 +1,141 @@
#!/usr/bin/env python3
# psql.py - PVC Monitoring example plugin for Postgres/Patroni
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the Patroni PostgreSQL instance on the node for operation.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "psql"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    """Monitoring plugin that checks the PostgreSQL databases on this node for operation."""

    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.
        """

        pass

    def _check_database(self, connect, prefix, query):
        """
        Connect to one configured database on this node and run a single query

        {connect} is the psycopg2 connect function, {prefix} selects the
        "{prefix}_postgresql_*" configuration keys and {query} is the statement to run.

        Returns None on success, or the failure message on any exception. The cursor and
        connection are pre-set to None so that the cleanup in "finally" is safe even when
        connect() itself fails.
        """

        conn = None
        cur = None
        try:
            conn = connect(
                host=self.this_node.name,
                port=self.config[f"{prefix}_postgresql_port"],
                dbname=self.config[f"{prefix}_postgresql_dbname"],
                user=self.config[f"{prefix}_postgresql_user"],
                password=self.config[f"{prefix}_postgresql_password"],
            )
            cur = conn.cursor()
            cur.execute(query)
            # Fetching a row only proves the query works; the data is not needed
            cur.fetchone()
        except Exception as e:
            # Only the first line of the error is reported
            err = str(e).split('\n')[0]
            dbname = self.config[f"{prefix}_postgresql_dbname"]
            return f"Failed to connect to PostgreSQL database {dbname}: {err}"
        finally:
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.close()

        return None

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator); this plugin does not use it.

        Checks the Metadata database first and the PowerDNS database only if that
        succeeded; the first failure sets a health delta of 50 and its message.
        """

        # Run any imports first
        from psycopg2 import connect

        # Set the health delta to 0 (no change)
        health_delta = 0

        # Craft a message that can be used by the clients
        message = "Successfully connected to PostgreSQL databases on localhost"

        checks = (
            # Metadata database (primary)
            ("metadata", """SELECT * FROM alembic_version"""),
            # PowerDNS database (secondary)
            ("pdns", """SELECT * FROM supermasters"""),
        )
        for prefix, query in checks:
            failure = self._check_database(connect, prefix, query)
            if failure is not None:
                health_delta = 50
                message = failure
                # Later databases are not checked once one has failed
                break

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass

138
health-daemon/plugins/psur Normal file
View File

@ -0,0 +1,138 @@
#!/usr/bin/env python3
# psur.py - PVC Monitoring example plugin for PSU Redundancy
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check IPMI for power supply redundancy status.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "psur"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    """Monitoring plugin that checks the IPMI power supply redundancy sensors."""

    # Names of the redundancy sensors to look for in the ipmitool output
    _REDUNDANCY_SENSORS = (
        "PS Redundancy",  # Dell PowerEdge
        "Power Supplies",  # HP ProLiant
        "PS_RDNDNT_MODE",  # Cisco UCS
    )

    @classmethod
    def _redundancy_readings(cls, ipmitool_output):
        """
        Return a list of (second field, last field) tuples for the redundancy sensors

        A line counts only if it has at least two "|"-separated fields and its first
        field is one of the known sensor names, so the second field always exists.
        """

        readings = []
        for line in ipmitool_output.split('\n'):
            fields = [field.strip() for field in line.split('|')]
            if len(fields) > 1 and fields[0] in cls._REDUNDANCY_SENSORS:
                readings.append((fields[1], fields[-1]))
        return readings

    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.

        Returns a failure message if ipmitool exits non-zero or reports none of the known
        redundancy sensors, and None otherwise.
        """

        # Run any imports first
        from daemon_lib.common import run_os_command

        ipmitool_ret, ipmitool_out, _ = run_os_command("ipmitool sdr type 'Power Supply'")
        if ipmitool_ret != 0:
            return "Error running ipmitool command"

        if not self._redundancy_readings(ipmitool_out):
            return "No valid input power sensors found"

        return None

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator); this plugin does not use it.

        Each redundancy sensor adds 0 to the health delta for "Fully Redundant", 5 for
        "No Reading" and 10 for anything else; finding no sensors at all gives 5.
        """

        # Run any imports first
        from daemon_lib.common import run_os_command

        health_delta = 0
        messages = []

        ipmitool_ret, ipmitool_out, _ = run_os_command("ipmitool sdr type 'Power Supply'")
        if ipmitool_ret != 0:
            # A failed ipmitool run is reported in the message but leaves the health
            # delta at 0
            messages.append("Error running ipmitool command")
        else:
            readings = self._redundancy_readings(ipmitool_out)
            if not readings:
                health_delta = 5
                messages.append("No valid input power sensors found, but configured")

            for reading_sensor, reading_text in readings:
                if reading_text == "Fully Redundant":
                    messages.append(f"Input power sensor {reading_sensor} reports {reading_text}")
                elif reading_text == "No Reading":
                    health_delta += 5
                    messages.append(f"Input power sensor {reading_sensor} reports {reading_text} (PSU redundancy not configured?)")
                else:
                    health_delta += 10
                    messages.append(f"Input power sensor {reading_sensor} reports {reading_text}")

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(', '.join(messages))

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass

109
health-daemon/plugins/zkpr Normal file
View File

@ -0,0 +1,109 @@
#!/usr/bin/env python3
# zkpr.py - PVC Monitoring example plugin for Zookeeper
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the Zookeeper instance on the node for operation.
# This script can thus be used as an example or reference implementation of a
# PVC monitoring pluginscript and expanded upon as required.
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.
# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.
# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvchealthd.objects.MonitoringInstance import MonitoringPlugin
# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "zkpr"
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    """Monitoring plugin that checks the Zookeeper instance on this node for operation."""

    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error. This plugin has no
        such conditions, so it always returns None.
        """

        return None

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator); this plugin does not use it.

        Starts a read-only Zookeeper client against port 2181 on this node and reads the
        '/schema/version' key. Any exception raised while doing so is reported with a
        health delta of 50.
        """

        # Run any imports first
        from kazoo.client import KazooClient, KazooState

        client = None

        # Check the Zookeeper connection; reading the key only proves that the
        # instance answers, so its value is deliberately discarded
        try:
            client = KazooClient(hosts=[f"{self.this_node.name}:2181"], timeout=1, read_only=True)
            client.start(timeout=1)
            client.get('/schema/version')
        except Exception as e:
            health_delta = 50
            message = f"Failed to connect to Zookeeper: {e}"
        else:
            # No change to the health, with a message that can be used by the clients
            health_delta = 0
            message = "Successfully connected to Zookeeper on localhost"
        finally:
            # Only shut down a client that was actually created
            if client is not None:
                client.stop()
                client.close()

        # Record the outcome in our local PluginResult object and hand it back
        self.plugin_result.set_health_delta(health_delta)
        self.plugin_result.set_message(message)
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly. Nothing to clean up here.
        """

        return None