Bump version to 0.9.25

Don't overwrite shutdown state on termination
Just a minor quibble and not really impactful.
2021-07-11 23:19:09 -04:00 · 2021-07-11 23:18:14 -04:00 · 2021-07-11 23:10:41 -04:00 · 2021-07-10 23:35:49 -04:00 · 2021-07-10 23:24:59 -04:00 · 2021-07-10 17:28:42 -04:00
11 changed files with 120 additions and 66 deletions
--- a/.version
+++ b/.version
@ -1 +1 @@
-0.9.24
+0.9.25
--- a/README.md
+++ b/README.md
@ -42,6 +42,14 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r

 ## Changelog

+#### v0.9.25
+
+  * [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
+  * [Node Daemon] Adds a date output to keepalive messages
+  * [Daemons] Configures ZK connection logging only for persistent connections
+  * [API Provisioner] Adds a context manager-based chroot to the Debootstrap example script
+  * [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
+
 #### v0.9.24

  * [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
--- a/api-daemon/provisioner/examples/debootstrap_script.py
+++ b/api-daemon/provisioner/examples/debootstrap_script.py
@ -34,6 +34,29 @@
 # with that.

 import os
+from contextlib import contextmanager
+
+
+# Create a chroot context manager
+# This can be used later in the script to chroot into the destination directory,
+# for instance to run commands within the target.
+@contextmanager
+def chroot_target(destination):
+    try:
+        real_root = os.open("/", os.O_RDONLY)
+        os.chroot(destination)
+        fake_root = os.open("/", os.O_RDONLY)
+        os.fchdir(fake_root)
+        yield
+    finally:
+        os.fchdir(real_root)
+        os.chroot(".")
+        os.fchdir(real_root)
+        os.close(fake_root)
+        os.close(real_root)
+        del fake_root
+        del real_root
+

 # Installation function - performs a debootstrap install of a Debian system
 # Note that the only arguments are keyword arguments.
@ -193,13 +216,7 @@ GRUB_DISABLE_LINUX_UUID=false
        fh.write(data)

    # Chroot, do some in-root tasks, then exit the chroot
-    # EXITING THE CHROOT IS VERY IMPORTANT OR THE FOLLOWING STAGES OF THE PROVISIONER
-    # WILL FAIL IN UNEXPECTED WAYS! Keep this in mind when using chroot in your scripts.
-    real_root = os.open("/", os.O_RDONLY)
-    os.chroot(temporary_directory)
-    fake_root = os.open("/", os.O_RDONLY)
-    os.fchdir(fake_root)
-
+    with chroot_target(temporary_directory):
        # Install and update GRUB
        os.system(
            "grub-install --force /dev/rbd/{}/{}_{}".format(root_disk['pool'], vm_name, root_disk['disk_id'])
@ -219,15 +236,6 @@ GRUB_DISABLE_LINUX_UUID=false
            "systemctl enable cloud-init.target"
        )

-    # Restore our original root/exit the chroot
-    # EXITING THE CHROOT IS VERY IMPORTANT OR THE FOLLOWING STAGES OF THE PROVISIONER
-    # WILL FAIL IN UNEXPECTED WAYS! Keep this in mind when using chroot in your scripts.
-    os.fchdir(real_root)
-    os.chroot(".")
-    os.fchdir(real_root)
-    os.close(fake_root)
-    os.close(real_root)
-
    # Unmount the bound devfs
    os.system(
        "umount {}/dev".format(
@ -235,8 +243,4 @@ GRUB_DISABLE_LINUX_UUID=false
        )
    )

-    # Clean up file handles so paths can be unmounted
-    del fake_root
-    del real_root
-
    # Everything else is done via cloud-init user-data
--- a/api-daemon/provisioner/examples/dummy_script.py
+++ b/api-daemon/provisioner/examples/dummy_script.py
@ -29,7 +29,7 @@
 # This script will run under root privileges as the provisioner does. Be careful
 # with that.

-# Installation function - performs a debootstrap install of a Debian system
+# Installation function - performs no actions then returns
 # Note that the only arguments are keyword arguments.
 def install(**kwargs):
    # The provisioner has already mounted the disks on kwargs['temporary_directory'].
--- a/api-daemon/pvcapid/Daemon.py
+++ b/api-daemon/pvcapid/Daemon.py
@ -25,7 +25,7 @@ import yaml
 from distutils.util import strtobool as dustrtobool

 # Daemon version
-version = '0.9.24'
+version = '0.9.25'

 # API version
 API_VERSION = 1.0
--- a/client-cli/setup.py
+++ b/client-cli/setup.py
@ -2,7 +2,7 @@ from setuptools import setup

 setup(
    name='pvc',
-    version='0.9.24',
+    version='0.9.25',
    packages=['pvc', 'pvc.cli_lib'],
    install_requires=[
        'Click',
--- a/daemon-common/zkhandler.py
+++ b/daemon-common/zkhandler.py
@ -140,13 +140,13 @@ class ZKHandler(object):
        """
        try:
            self.zk_conn.start()
-            self.log('Connection to Zookeeper started', state='o')
            if persistent:
+                self.log('Connection to Zookeeper started', state='o')
                self.zk_conn.add_listener(self.listener)
        except Exception as e:
            raise ZKConnectionException(self, e)

-    def disconnect(self):
+    def disconnect(self, persistent=False):
        """
        Stop and close the zk_conn object and disconnect from the cluster

@ -154,6 +154,7 @@ class ZKHandler(object):
        """
        self.zk_conn.stop()
        self.zk_conn.close()
+        if persistent:
            self.log('Connection to Zookeeper terminated', state='o')

    #
--- a/debian/changelog
+++ b/debian/changelog
@ -1,3 +1,13 @@
+pvc (0.9.25-0) unstable; urgency=high
+
+  * [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
+  * [Node Daemon] Adds a date output to keepalive messages
+  * [Daemons] Configures ZK connection logging only for persistent connections
+  * [API Provisioner] Adds a context manager-based chroot to the Debootstrap example script
+  * [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
+
+ -- Joshua M. Boniface <joshua@boniface.me>  Sun, 11 Jul 2021 23:19:09 -0400
+
 pvc (0.9.24-0) unstable; urgency=high

  * [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
--- a/debian/control
+++ b/debian/control
@ -8,7 +8,7 @@ X-Python3-Version: >= 3.2

 Package: pvc-daemon-node
 Architecture: all
-Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
+Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-rados, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
 Suggests: pvc-client-api, pvc-client-cli
 Description: Parallel Virtual Cluster node daemon (Python 3)
 A KVM/Zookeeper/Ceph-based VM and private cloud manager
--- a/docs/index.md
+++ b/docs/index.md
@ -42,6 +42,14 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r

 ## Changelog

+#### v0.9.25
+
+  * [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
+  * [Node Daemon] Adds a date output to keepalive messages
+  * [Daemons] Configures ZK connection logging only for persistent connections
+  * [API Provisioner] Adds a context manager-based chroot to the Debootstrap example script
+  * [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
+
 #### v0.9.24

  * [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
--- a/node-daemon/pvcnoded/Daemon.py
+++ b/node-daemon/pvcnoded/Daemon.py
@ -32,12 +32,14 @@ import yaml
 import json

 from socket import gethostname
+from datetime import datetime
 from threading import Thread
 from ipaddress import ip_address, ip_network
 from apscheduler.schedulers.background import BackgroundScheduler
 from distutils.util import strtobool
 from queue import Queue
 from xml.etree import ElementTree
+from rados import Rados

 from daemon_lib.zkhandler import ZKHandler

@ -54,7 +56,7 @@ import pvcnoded.CephInstance as CephInstance
 import pvcnoded.MetadataAPIInstance as MetadataAPIInstance

 # Version string for startup output
-version = '0.9.24'
+version = '0.9.25'

 ###############################################################################
 # PVCD - node daemon startup program
@ -657,7 +659,7 @@ def update_schema(new_schema_version, stat, event=''):
    # Restart ourselves with the new schema
    logger.out('Reloading node daemon', state='s')
    try:
-        zkhandler.disconnect()
+        zkhandler.disconnect(persistent=True)
        del zkhandler
    except Exception:
        pass
@ -750,7 +752,7 @@ def cleanup():

    # Close the Zookeeper connection
    try:
-        zkhandler.disconnect()
+        zkhandler.disconnect(persistent=True)
        del zkhandler
    except Exception:
        pass
@ -1313,13 +1315,24 @@ def collect_ceph_stats(queue):
    if debug:
        logger.out("Thread starting", state='d', prefix='ceph-thread')

+    # Connect to the Ceph cluster
+    try:
+        ceph_conn = Rados(conffile=config['ceph_config_file'], conf=dict(keyring=config['ceph_admin_keyring']))
+        if debug:
+            logger.out("Connecting to cluster", state='d', prefix='ceph-thread')
+        ceph_conn.connect(timeout=1)
+    except Exception as e:
+        logger.out('Failed to open connection to Ceph cluster: {}'.format(e), state='e')
+        return
+
    if debug:
        logger.out("Getting health stats from monitor", state='d', prefix='ceph-thread')

    # Get Ceph cluster health for local status output
-    _, stdout, _ = common.run_os_command('ceph health --format json', timeout=1)
+    command = {"prefix": "health", "format": "json"}
    try:
-        ceph_health = json.loads(stdout)['status']
+        health_status = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])
+        ceph_health = health_status['status']
    except Exception as e:
        logger.out('Failed to obtain Ceph health data: {}'.format(e), state='e')
        ceph_health = 'HEALTH_UNKN'
@ -1338,7 +1351,8 @@ def collect_ceph_stats(queue):
        if debug:
            logger.out("Set ceph health information in zookeeper (primary only)", state='d', prefix='ceph-thread')

-        _, ceph_status, _ = common.run_os_command('ceph status --format plain', timeout=1)
+        command = {"prefix": "status", "format": "pretty"}
+        ceph_status = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
        try:
            zkhandler.write([
                ('base.storage', str(ceph_status))
@ -1350,7 +1364,8 @@ def collect_ceph_stats(queue):
            logger.out("Set ceph rados df information in zookeeper (primary only)", state='d', prefix='ceph-thread')

        # Get rados df info
-        _, ceph_df, _ = common.run_os_command('ceph df --format plain', timeout=1)
+        command = {"prefix": "df", "format": "pretty"}
+        ceph_df = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
        try:
            zkhandler.write([
                ('base.storage.util', str(ceph_df))
@ -1362,14 +1377,15 @@ def collect_ceph_stats(queue):
            logger.out("Set pool information in zookeeper (primary only)", state='d', prefix='ceph-thread')

        # Get pool info
-        _, stdout, _ = common.run_os_command('ceph df --format json', timeout=1)
+        command = {"prefix": "df", "format": "json"}
+        ceph_df_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
        try:
-            ceph_pool_df_raw = json.loads(stdout)['pools']
+            ceph_pool_df_raw = json.loads(ceph_df_output)['pools']
        except Exception as e:
            logger.out('Failed to obtain Pool data (ceph df): {}'.format(e), state='w')
            ceph_pool_df_raw = []

-        _, stdout, _ = common.run_os_command('rados df --format json', timeout=1)
+        retcode, stdout, stderr = common.run_os_command('rados df --format json', timeout=1)
        try:
            rados_pool_df_raw = json.loads(stdout)['pools']
        except Exception as e:
@ -1434,9 +1450,10 @@ def collect_ceph_stats(queue):
        # Parse the dump data
        osd_dump = dict()

-        _, stdout, _ = common.run_os_command('ceph osd dump --format json --connect-timeout 1', timeout=1)
+        command = {"prefix": "osd dump", "format": "json"}
+        osd_dump_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
        try:
-            osd_dump_raw = json.loads(stdout)['osds']
+            osd_dump_raw = json.loads(osd_dump_output)['osds']
        except Exception as e:
            logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
            osd_dump_raw = []
@ -1459,9 +1476,9 @@ def collect_ceph_stats(queue):

        osd_df = dict()

-        _, osd_df_out, _ = common.run_os_command('ceph osd df --format json', timeout=1)
+        command = {"prefix": "osd df", "format": "json"}
        try:
-            osd_df_raw = json.loads(osd_df_out)['nodes']
+            osd_df_raw = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])['nodes']
        except Exception as e:
            logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
            osd_df_raw = []
@ -1486,10 +1503,12 @@ def collect_ceph_stats(queue):

        osd_status = dict()

-        retcode, osd_status_raw, stderr = common.run_os_command('ceph osd status --format plain', timeout=1)
-        if retcode != 0:
-            logger.out('Failed to obtain OSD status data: {}'.format(stderr), state='w')
-            osd_status_raw = ''
+        command = {"prefix": "osd status", "format": "pretty"}
+        try:
+            osd_status_raw = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
+        except Exception as e:
+            logger.out('Failed to obtain OSD status data: {}'.format(e), state='w')
+            osd_status_raw = []

        if debug:
            logger.out("Loop through OSD status data", state='d', prefix='ceph-thread')
@ -1556,6 +1575,8 @@ def collect_ceph_stats(queue):
                    # One or more of the status commands timed out, just continue
                    logger.out('Failed to upload OSD stats from dictionary: {}'.format(e), state='w')

+    ceph_conn.shutdown()
+
    queue.put(ceph_health_colour)
    queue.put(ceph_health)
    queue.put(osds_this_node)
@ -1758,8 +1779,9 @@ def node_keepalive():
    # Get past state and update if needed
    if debug:
        logger.out("Get past state and update if needed", state='d', prefix='main-thread')
+
    past_state = zkhandler.read(('node.state.daemon', this_node.name))
-    if past_state != 'run':
+    if past_state != 'run' and past_state != 'shutdown':
        this_node.daemon_state = 'run'
        zkhandler.write([
            (('node.state.daemon', this_node.name), 'run')
@ -1858,9 +1880,10 @@ def node_keepalive():
        else:
            cst_colour = fmt_cyan
        logger.out(
-            '{}{} keepalive{} [{}{}{}]'.format(
+            '{}{} keepalive @ {}{} [{}{}{}]'.format(
                fmt_purple,
                myhostname,
+                datetime.now(),
                fmt_end,
                fmt_bold + cst_colour,
                this_node.router_state,
Author	SHA1	Message	Date
Joshua M. Boniface	2e9f6ac201	Bump version to 0.9.25	2021-07-11 23:19:09 -04:00
Joshua M. Boniface	f09849bedf	Don't overwrite shutdown state on termination Just a minor quibble and not really impactful.	2021-07-11 23:18:14 -04:00
Joshua M. Boniface	8c975e5c46	Add chroot context manager example to debootstrap Closes #132	2021-07-11 23:10:41 -04:00
Joshua M. Boniface	c76149141f	Only log ZK connections when persistent Prevents spam in the API logs.	2021-07-10 23:35:49 -04:00
Joshua M. Boniface	f00c4d07f4	Add date output to keepalive Helps track when there is a log follow in "-o cat" mode.	2021-07-10 23:24:59 -04:00
Joshua M. Boniface	20b66c10e1	Move two more commands to Rados library	2021-07-10 17:28:42 -04:00
Joshua M. Boniface	cfeba50b17	Revert "Return to all command-based Ceph gathering" This reverts commit `65d14ccd92`. This was actually a bad idea. For inexplicable reasons, running these Ceph commands manually (not even via Python, but in a normal shell) takes roughly 600-700 times (nearly three orders of magnitude) longer than running them with the Rados module, so long, in fact, that some basic commands like "ceph health" would sometimes take longer than the 1 second timeout to complete. The Rados commands would, however, take about 1ms instead. Despite the occasional issues when monitors drop out, the Rados module is clearly far superior to the shell commands for any moderately-loaded Ceph cluster. We can look into solving timeouts another way (perhaps with Processes instead of Threads) at a later time. Rados module "ceph health": b'{"checks":{},"status":"HEALTH_OK"}' 0.001204 (s) b'{"checks":{},"status":"HEALTH_OK"}' 0.001258 (s) Command "ceph health": joshua@hv1.c.bonilan.net ~ $ time ceph health >/dev/null real 0m0.772s user 0m0.707s sys 0m0.046s joshua@hv1.c.bonilan.net ~ $ time ceph health >/dev/null real 0m0.796s user 0m0.728s sys 0m0.054s	2021-07-10 03:47:45 -04:00
@ -1 +1 @@
-0.9.24
+0.9.25