Compare commits: caadafa80d...v0.9.78 (20 commits)

* c6c44bf775
* bbb940da65
* a0b45a2bcd
* 35e27f79ef
* ad2e7750ff
* 7c0f12750e
* 1c68e83d98
* 51e78480fa
* c4397219da
* f46bfc962f
* 714d4b6005
* fa8329ac3d
* 457b7bed3d
* 86115b2928
* 1a906b589e
* 7b230d8bd5
* 48662e90c1
* 079381c03e
* 794cea4a02
* fa24f3ba75
CHANGELOG.md (27 changed lines)
@@ -1,5 +1,32 @@
## PVC Changelog

###### [v0.9.78](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.78)

* [API, Client CLI] Fixes several bugs around image uploads; adds a new query parameter for non-raw images
* [API] Ensures RBD images are created with a raw bytes value to avoid rounding errors

###### [v0.9.77](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.77)

* [Client CLI] Fixes a bug from a bad library import

###### [v0.9.76](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.76)

* [API, Client CLI] Corrects some missing node states for fencing in status output

###### [v0.9.75](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.75)

* [Node Daemon] Adds a startup message about IPMI when succeeding
* [Node Daemon] Fixes a bug in fencing allowing non-failing VMs to migrate
* [Node Daemon] Adds rounding to load average in load plugin for consistency

###### [v0.9.74](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.74)

* [Docs] Removes docs from the main repo
* [Client CLI] Ensures that "provision" VMs are shown in the right colour
* [Node Daemon] Separates the node monitoring subsystem into its own thread with a longer, customizable update interval
* [Node Daemon] Adds checks for PSU input power redundancy (psur) and hardware RAID (hwrd)
* [Node Daemon] Updates when Keepalive start messages are printed (end of run, with runtime) to align with new monitoring messages

###### [v0.9.73](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.73)

* [Node Daemon] Fixes a bug creating monitoring instance
@@ -27,7 +27,7 @@ from ssl import SSLContext, TLSVersion
from distutils.util import strtobool as dustrtobool

# Daemon version
version = "0.9.73"
version = "0.9.78"

# API version
API_VERSION = 1.0
@@ -5088,7 +5088,12 @@ class API_Storage_Ceph_Volume_Element_Upload(Resource):
"required": True,
"location": ["args"],
"helptext": "A source image format must be specified.",
}
},
{
"name": "file_size",
"required": False,
"location": ["args"],
},
]
)
@Authenticator
@@ -5113,6 +5118,11 @@ class API_Storage_Ceph_Volume_Element_Upload(Resource):
- qed
- vdi
- vpc
- in: query
name: file_size
type: integer
required: false
description: The size of the image file, if {image_format} is not "raw"
responses:
200:
description: OK
@@ -5131,7 +5141,10 @@ class API_Storage_Ceph_Volume_Element_Upload(Resource):
id: Message
"""
return api_helper.ceph_volume_upload(
pool, volume, reqargs.get("image_format", None)
pool,
volume,
reqargs.get("image_format", None),
reqargs.get("file_size", None),
)
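For context, a client of this endpoint now sends the source file's size alongside the image format whenever the image is not raw, mirroring the pvc.lib.storage change further down (path.getsize plus a file_size query parameter). A minimal client-side sketch follows; the endpoint path, API-key header, and multipart field name are assumptions for illustration, not taken from the diff:

```python
import os

import requests  # assumed HTTP client for this sketch


def upload_volume_image(api_url, api_key, pool, volume, image_file, image_format):
    """Sketch of a call against API_Storage_Ceph_Volume_Element_Upload."""
    # Non-raw images need an explicit file_size so the API can size the
    # temporary conversion volume before running qemu-img on the remote side.
    params = {"image_format": image_format}
    if image_format != "raw":
        params["file_size"] = os.path.getsize(image_file)

    with open(image_file, "rb") as fh:
        return requests.post(
            f"{api_url}/storage/ceph/volume/{pool}/{volume}/upload",  # assumed path
            headers={"X-Api-Key": api_key},  # assumed auth header name
            params=params,
            files={"file": fh},  # assumed multipart field name
        )
```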
@@ -1584,7 +1584,7 @@ def ceph_volume_remove(zkhandler, pool, name):
@ZKConnection(config)
def ceph_volume_upload(zkhandler, pool, volume, img_type):
def ceph_volume_upload(zkhandler, pool, volume, img_type, file_size=None):
"""
Upload a raw file via HTTP post to a PVC Ceph volume
"""
@@ -1605,7 +1605,17 @@ def ceph_volume_upload(zkhandler, pool, volume, img_type):
}
retcode = 400
return output, retcode
dev_size = retdata[0]["stats"]["size"]

try:
dev_size = retdata[0]["stats"]["size"]
except Exception:
output = {
"message": "Target volume '{}' does not exist in pool '{}'.".format(
volume, pool
)
}
retcode = 400
return output, retcode

def cleanup_maps_and_volumes():
# Unmap the target blockdev
@@ -1619,8 +1629,14 @@ def ceph_volume_upload(zkhandler, pool, volume, img_type):
zkhandler, pool, "{}_tmp".format(volume)
)

# Create a temporary block device to store non-raw images
if img_type == "raw":
if file_size != dev_size:
output = {
"message": f"Image file size {file_size} does not match volume size {dev_size}"
}
retcode = 400
return output, retcode

# Map the target blockdev
retflag, retdata = pvc_ceph.map_volume(zkhandler, pool, volume)
if not retflag:
@@ -1659,11 +1675,15 @@ def ceph_volume_upload(zkhandler, pool, volume, img_type):
cleanup_maps_and_volumes()
return output, retcode

# Write the image directly to the blockdev
else:
if file_size is None:
output = {"message": "A file size must be specified"}
retcode = 400
return output, retcode

# Create a temporary blockdev
retflag, retdata = pvc_ceph.add_volume(
zkhandler, pool, "{}_tmp".format(volume), dev_size
zkhandler, pool, "{}_tmp".format(volume), file_size
)
if not retflag:
output = {"message": retdata.replace('"', "'")}
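Distilled from the three hunks above, the upload path now validates sizes before touching Ceph: raw uploads must exactly match the target volume's byte size, while non-raw uploads must declare a file size so the temporary conversion volume can be created with it. A simplified sketch of that decision logic follows (a hypothetical helper for illustration; the real code returns Flask-style output dicts and performs the Ceph calls inline):

```python
def validate_upload_size(img_type, file_size, dev_size):
    """Simplified sketch of the size checks added to ceph_volume_upload."""
    if img_type == "raw":
        # Raw images are written directly onto the volume, so the file must
        # fill it exactly; a mismatch is rejected up front.
        if file_size != dev_size:
            return False, f"Image file size {file_size} does not match volume size {dev_size}"
    else:
        # Non-raw images are staged into a "<volume>_tmp" volume sized from
        # file_size, so the client has to supply that value.
        if file_size is None:
            return False, "A file size must be specified"
    return True, None
```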
@@ -1188,10 +1188,8 @@ def cli_vm_modify(
# Grab the current config
current_vm_cfg_raw = vm_information.get("xml")
xml_data = etree.fromstring(current_vm_cfg_raw)
current_vm_cfgfile = (
etree.tostring(xml_data, pretty_print=True).decode("utf8").strip()
)
xml_data = fromstring(current_vm_cfg_raw)
current_vm_cfgfile = tostring(xml_data, pretty_print=True).decode("utf8").strip()

if editor is True:
new_vm_cfgfile = click.edit(
@@ -3600,7 +3598,7 @@ def cli_storage_volume_upload(pool, name, image_format, image_file):
If the image format is "raw", the image is uploaded directly to the target volume without modification. Otherwise, it will be converted into raw format by "qemu-img convert" on the remote side before writing using a temporary volume. The image format must be a valid format recognized by "qemu-img", such as "vmdk" or "qcow2".
"""

if not os.path.exists(image_file):
if not path.exists(image_file):
echo(CLI_CONFIG, "ERROR: File '{}' does not exist!".format(image_file))
exit(1)
@@ -4912,13 +4910,13 @@ def cli_provisioner_ova_upload(name, filename, pool):
Storage templates, provisioning scripts, and arguments for OVA-type profiles will be ignored and should not be set.
"""

if not os.path.exists(filename):
if not path.exists(filename):
echo(CLI_CONFIG, "ERROR: File '{}' does not exist!".format(filename))
exit(1)

params = dict()
params["pool"] = pool
params["ova_size"] = os.path.getsize(filename)
params["ova_size"] = path.getsize(filename)

retcode, retdata = pvc.lib.provisioner.ova_upload(
CLI_CONFIG, name, filename, params
@@ -135,7 +135,7 @@ def cli_cluster_status_format_pretty(CLI_CONFIG, data):
state_colour = ansii["green"]
elif state in ["run,flush", "run,unflush", "run,flushed"]:
state_colour = ansii["blue"]
elif "dead" in state or "stop" in state:
elif "dead" in state or "fenced" in state or "stop" in state:
state_colour = ansii["red"]
else:
state_colour = ansii["yellow"]
@@ -21,6 +21,7 @@
import math

from os import path
from json import loads
from requests_toolbelt.multipart.encoder import (
MultipartEncoder,
@@ -1209,6 +1210,11 @@ def ceph_volume_upload(config, pool, volume, image_format, image_file):
"""
import click

if image_format != "raw":
file_size = path.getsize(image_file)
else:
file_size = None

bar = UploadProgressBar(
image_file, end_message="Parsing file on remote side...", end_nl=False
)
@@ -1220,7 +1226,7 @@ def ceph_volume_upload(config, pool, volume, image_format, image_file):
upload_monitor = MultipartEncoderMonitor(upload_data, bar.update)

headers = {"Content-Type": upload_monitor.content_type}
params = {"image_format": image_format}
params = {"image_format": image_format, "file_size": file_size}

response = call_api(
config,
@@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="pvc",
version="0.9.73",
version="0.9.78",
packages=["pvc.cli", "pvc.lib"],
install_requires=[
"Click",
@@ -763,9 +763,7 @@ def add_volume(zkhandler, pool, name, size):
# 2. Create the volume
retcode, stdout, stderr = common.run_os_command(
"rbd create --size {} {}/{}".format(
format_bytes_tohuman(size_bytes), pool, name
)
"rbd create --size {}B {}/{}".format(size_bytes, pool, name)
)
if retcode:
return False, 'ERROR: Failed to create RBD volume "{}": {}'.format(name, stderr)
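This hunk implements the changelog's "raw bytes value" fix: the exact byte count is now passed to rbd create with an explicit B suffix instead of a human-formatted size, whose rounding could make the volume slightly smaller or larger than the image it is meant to hold. A small illustration of the difference; the pool/volume names and the rounded "20.0GB" string are made up for the example, not taken from format_bytes_tohuman:

```python
size_bytes = 21474837504  # e.g. an image a few KiB over 20 GiB

# Before: a rounded, human-readable size can drift from the real byte count.
old_cmd = "rbd create --size {} {}/{}".format("20.0GB", "vms", "test_disk")

# After: the exact byte count is passed through with a "B" suffix.
new_cmd = "rbd create --size {}B {}/{}".format(size_bytes, "vms", "test_disk")
```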
@@ -256,8 +256,13 @@ def getClusterInformation(zkhandler):
"stop,unflush",
"dead,ready",
"dead,flush",
"dead,fence-flush",
"dead,flushed",
"dead,unflush",
"fenced,ready",
"fenced,flush",
"fenced,flushed",
"fenced,unflush",
]
vm_state_combinations = [
"start",
debian/changelog (37 changed lines, vendored)
@@ -1,3 +1,40 @@
pvc (0.9.78-0) unstable; urgency=high

* [API, Client CLI] Fixes several bugs around image uploads; adds a new query parameter for non-raw images
* [API] Ensures RBD images are created with a raw bytes value to avoid rounding errors

-- Joshua M. Boniface <joshua@boniface.me> Sat, 30 Sep 2023 12:57:55 -0400

pvc (0.9.77-0) unstable; urgency=high

* [Client CLI] Fixes a bug from a bad library import

-- Joshua M. Boniface <joshua@boniface.me> Tue, 19 Sep 2023 11:05:55 -0400

pvc (0.9.76-0) unstable; urgency=high

* [API, Client CLI] Corrects some missing node states for fencing in status output

-- Joshua M. Boniface <joshua@boniface.me> Mon, 18 Sep 2023 10:15:52 -0400

pvc (0.9.75-0) unstable; urgency=high

* [Node Daemon] Adds a startup message about IPMI when succeeding
* [Node Daemon] Fixes a bug in fencing allowing non-failing VMs to migrate
* [Node Daemon] Adds rounding to load average in load plugin for consistency

-- Joshua M. Boniface <joshua@boniface.me> Sat, 16 Sep 2023 23:06:38 -0400

pvc (0.9.74-0) unstable; urgency=high

* [Docs] Removes docs from the main repo
* [Client CLI] Ensures that "provision" VMs are shown in the right colour
* [Node Daemon] Separates the node monitoring subsystem into its own thread with a longer, customizable update interval
* [Node Daemon] Adds checks for PSU input power redundancy (psur) and hardware RAID (hwrd)
* [Node Daemon] Updates when Keepalive start messages are printed (end of run, with runtime) to align with new monitoring messages

-- Joshua M. Boniface <joshua@boniface.me> Sat, 16 Sep 2023 00:18:13 -0400

pvc (0.9.73-0) unstable; urgency=high

* [Node Daemon] Fixes a bug creating monitoring instance
@@ -14,7 +14,7 @@ sys.path.append('api-daemon')
import pvcapid.flaskapi as pvc_api

swagger_file = "docs/manuals/swagger.json"
swagger_file = "swagger.json"
swagger_data = swagger(pvc_api.app)
swagger_data['info']['version'] = "1.0"
swagger_data['info']['title'] = "PVC Client and Provisioner API"
@@ -22,3 +22,5 @@ swagger_data['host'] = "pvc.local:7370"
with open(swagger_file, 'w') as fd:
fd.write(json.dumps(swagger_data, sort_keys=True, indent=4))

print(f"Swagger file output to {swagger_file}; add it to the PVC documentation repo.")
node-daemon/plugins/hwrd (new file, 247 lines)
@@ -0,0 +1,247 @@
#!/usr/bin/env python3

# hwrd.py - PVC Monitoring example plugin for hardware RAID Arrays
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2023 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check any hardware RAID virtual disks for health and report errors.
# Supports Dell BOSS cards, LSI/Avago/Broadcom MegaRAID, and HP SmartArray RAID.

# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "hwrd"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def check_dellboss(self):
        # Run any imports first
        from daemon_lib.common import run_os_command
        from re import match

        health_delta = 0
        messages = list()

        _dellboss_ret, _dellboss_list, _ = run_os_command("mvcli info -o vd")
        if _dellboss_ret != 0:
            health_delta = 50
            messages.append("Error running MVCLI command")
        else:
            arrays = list()
            idx = None

            for line in _dellboss_list.split('\n'):
                if match(r"^id:", line):
                    idx = int(line.split(":")[-1].strip())
                    arrays.append(dict())
                if match(r"^name:", line):
                    arrays[idx]["name"] = line.split(":")[-1].strip()
                if match(r"^status:", line):
                    arrays[idx]["status"] = line.split(":")[-1].strip()

            for idx, array in enumerate(arrays):
                if array["status"] != "functional":
                    health_delta += 50
                    messages.append(f"RAID Dell BOSS ID {idx} (Name: {array['name']}, State: {array['status']})")

        if len(messages) < 1:
            messages.append(f"No valid RAID arrays found")

        return health_delta, messages

    def check_megaraid(self):
        # Run any imports first
        from daemon_lib.common import run_os_command
        from re import match

        health_delta = 0
        messages = list()

        _megaraid_ret, _megaraid_list, _ = run_os_command("megacli -LDInfo -Lall -aALL")
        if _megaraid_ret != 0:
            health_delta = 50
            messages.append("Error running MegaCLI command")
        else:
            vd_list = _megaraid_list.split('\n\n\n')
            for idx, _vd in enumerate(vd_list):
                vd = _vd.split('\n')
                if "Virtual Drive Information" not in vd[2]:
                    continue

                raid_name = None
                raid_count = 0
                raid_state = None

                for entry in vd:
                    if len(entry.split(':')) < 2:
                        continue

                    entry_key = entry.split(':')[0].strip()
                    entry_value = entry.split(':')[1].strip()

                    if entry_key == "State":
                        raid_state = entry_value
                    if entry_key == "Name":
                        raid_name = entry_value
                    if entry_key == "Number Of Drives":
                        raid_count = entry_value

                if raid_state is None or raid_name is None or raid_count == 0:
                    health_delta += 10
                    messages.append(f"RAID ID {idx} did not report useful values")
                    continue

                if raid_state != "Optimal":
                    health_delta += 50
                    messages.append(f"RAID MegaRAID ID {idx} (Name: {raid_name}, Disks: {raid_count}, State: {raid_state})")

        if len(messages) < 1:
            messages.append(f"No valid RAID arrays found")

        return health_delta, messages

    def check_hpsa(self):
        # Run any imports first
        from daemon_lib.common import run_os_command
        from re import match, findall

        health_delta = 0
        messages = list()

        _hparray_ret, _hparray_list, _ = run_os_command(f"ssacli ctrl slot={self.controller_slot} ld all show")
        if _hparray_ret != 0:
            health_delta = 50
            messages.append("Error running SSACLI command")
        else:
            vd_lines = _hparray_list.split('\n\n')

            arrays = list()
            cur_array = None
            for idx, _line in enumerate(vd_lines):
                line = _line.strip()
                if match(r"^Array", line):
                    cur_array = line
                if match(r"^logicaldrive", line) and cur_array is not None:
                    arrays.append(f"{cur_array} {line}")

            for array in arrays:
                if "OK" not in array:
                    health_delta += 50
                    messages.append(f"RAID HPSA {array}")

        if len(messages) < 1:
            messages.append(f"No valid RAID arrays found")

        return health_delta, messages

    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        from daemon_lib.common import run_os_command
        from re import match, findall

        self.raid_type = list()

        _dellboss_ret, _dellboss_list, _ = run_os_command("mvcli info -o vd")
        if _dellboss_ret == 0:
            # If this returns 0 at all, there's a valid BOSS card to manage
            self.raid_type.append("dellboss")

        _megaraid_ret, _megaraid_list, _ = run_os_command("megacli -LDInfo -Lall -aALL")
        if _megaraid_ret == 0:
            vd_list = _megaraid_list.split('\n\n\n')
            for idx, _vd in enumerate(vd_list):
                vd = _vd.split('\n')
                if "Virtual Drive Information" in vd[2]:
                    self.raid_type.append("megaraid")

        _hpraid_ret, _hpraid_list, _ = run_os_command("ssacli ctrl all show status")
        if _hpraid_ret == 0:
            for line in _hpraid_list.split('\n'):
                if match(r"^Smart", line):
                    controller_slots = findall("Slot ([0-9])", line)
                    if len(controller_slots) > 0:
                        self.raid_type.append("hpsa")
                        self.controller_slot = controller_slots[0]

        if len(self.raid_type) < 1:
            return "No hardware RAID management commands found"

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object
        """

        health_delta = 0
        messages = list()

        raid_function_map = {
            "megaraid": self.check_megaraid,
            "hpsa": self.check_hpsa,
            "dellboss": self.check_dellboss,
        }

        for raid_type in self.raid_type:
            _health_delta, _messages = raid_function_map.get(raid_type)()
            health_delta += _health_delta
            messages += _messages

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(', '.join(messages))

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
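The header comments above spell out the plugin contract: the file exposes a PLUGIN_NAME identical to its file name and a MonitoringPluginScript class extending MonitoringPlugin that provides setup(), run(), and cleanup(). Stripped of the RAID-specific logic, a minimal plugin following that contract would look roughly like the sketch below (illustrative only, not another shipped plugin; the name "example" is made up):

```python
#!/usr/bin/env python3

# example.py - minimal PVC monitoring plugin sketch (illustrative only)

from pvcnoded.objects.MonitoringInstance import MonitoringPlugin

# Must be identical to the plugin's file name
PLUGIN_NAME = "example"


class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        # Optional startup checks; return a non-None message to refuse to load
        return None

    def run(self, coordinator_state=None):
        # Perform the check, then record the result on the PluginResult object
        self.plugin_result.set_health_delta(0)
        self.plugin_result.set_message("No problems found")
        return self.plugin_result

    def cleanup(self):
        # Optional teardown during node daemon termination
        pass
```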
@@ -72,7 +72,7 @@ class MonitoringPluginScript(MonitoringPlugin):
from psutil import cpu_count

# Get the current 1-minute system load average
load_average = getloadavg()[0]
load_average = float(round(getloadavg()[0], 2))

# Get the number of CPU cores
cpu_cores = cpu_count()
@@ -111,7 +111,7 @@ class MonitoringPluginScript(MonitoringPlugin):
messages.append(f"Input power sensor {reading_sensor} reports {reading_text}")
elif reading_text == "No Reading":
health_delta += 5
messages.append("Input power sensor {reading_sensor} reports {reading_text}, redundant power not configured")
messages.append(f"Input power sensor {reading_sensor} reports {reading_text} (PSU redundancy not configured?)")
else:
health_delta += 10
messages.append(f"Input power sensor {reading_sensor} reports {reading_text}")
@@ -49,7 +49,7 @@ import re
import json

# Daemon version
version = "0.9.73"
version = "0.9.78"


##########################################################
@@ -324,9 +324,14 @@ def entrypoint():
config["ipmi_hostname"], config["ipmi_username"], config["ipmi_password"]
):
logger.out(
"Our IPMI is not reachable; fencing of this node will likely fail",
"Our IPMI interface is not reachable; fencing of this node will fail until corrected",
state="w",
)
else:
logger.out(
"Our IPMI interface is reachable; fencing of this node is possible",
state="o",
)

# Validate libvirt
if not pvcnoded.util.libvirt.validate_libvirtd(logger, config):
@@ -1024,14 +1029,14 @@ def entrypoint():
state="i",
)

# Set up the node monitoring instance
# Set up the node monitoring instance and thread
monitoring_instance = MonitoringInstance.MonitoringInstance(
zkhandler, config, logger, this_node
)

# Start keepalived thread
keepalive_timer = pvcnoded.util.keepalive.start_keepalive_timer(
logger, config, zkhandler, this_node, monitoring_instance
logger, config, zkhandler, this_node
)

# Tick loop; does nothing since everything is async
@@ -335,8 +335,8 @@ class MonitoringInstance(object):
)
)

self.start_check_timer()
self.run_plugins()
self.start_check_timer()

def __del__(self):
self.shutdown()
@@ -153,7 +153,13 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
# Loop through the VMs
for dom_uuid in dead_node_running_domains:
fence_migrate_vm(dom_uuid)
try:
fence_migrate_vm(dom_uuid)
except Exception as e:
logger.out(
f"Failed to migrate VM {dom_uuid}, continuing: {e}",
state="w",
)

# Set node in flushed state for easy remigrating when it comes back
zkhandler.write([(("node.state.domain", node_name), "flushed")])
@@ -51,7 +51,7 @@ libvirt_vm_states = {
}


def start_keepalive_timer(logger, config, zkhandler, this_node, monitoring_instance):
def start_keepalive_timer(logger, config, zkhandler, this_node):
keepalive_interval = config["keepalive_interval"]
logger.out(
f"Starting keepalive timer ({keepalive_interval} second interval)", state="s"
@@ -59,7 +59,7 @@ def start_keepalive_timer(logger, config, zkhandler, this_node, monitoring_insta
keepalive_timer = BackgroundScheduler()
keepalive_timer.add_job(
node_keepalive,
args=(logger, config, zkhandler, this_node, monitoring_instance),
args=(logger, config, zkhandler, this_node),
trigger="interval",
seconds=keepalive_interval,
)
@@ -674,7 +674,7 @@ def collect_vm_stats(logger, config, zkhandler, this_node, queue):
# Keepalive update function
def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
def node_keepalive(logger, config, zkhandler, this_node):
debug = config["debug"]

# Display node information to the terminal
@@ -685,18 +685,10 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
cst_colour = logger.fmt_blue
else:
cst_colour = logger.fmt_cyan
logger.out(
"{}{} keepalive @ {}{} [{}{}{}]".format(
logger.fmt_purple,
config["node_hostname"],
datetime.now(),
logger.fmt_end,
logger.fmt_bold + cst_colour,
this_node.coordinator_state,
logger.fmt_end,
),
state="t",
)

active_coordinator_state = this_node.coordinator_state

runtime_start = datetime.now()

# Set the migration selector in Zookeeper for clients to read
if config["enable_hypervisor"]:
@@ -860,6 +852,23 @@ def node_keepalive(logger, config, zkhandler, this_node, monitoring_instance):
logger.out("Failed to set keepalive data", state="e")

if config["log_keepalives"]:
runtime_end = datetime.now()
runtime_delta = runtime_end - runtime_start
runtime = "{:0.02f}".format(runtime_delta.total_seconds())

logger.out(
"{start_colour}{hostname} keepalive @ {starttime}{nofmt} [{cst_colour}{costate}{nofmt}] in {runtime} seconds".format(
start_colour=logger.fmt_purple,
cst_colour=logger.fmt_bold + cst_colour,
nofmt=logger.fmt_end,
hostname=config["node_hostname"],
starttime=runtime_start,
costate=active_coordinator_state,
runtime=runtime,
),
state="t",
)

if this_node.maintenance is True:
maintenance_colour = logger.fmt_blue
else: