Compare commits

..

7 Commits

Author SHA1 Message Date
2e9f6ac201 Bump version to 0.9.25 2021-07-11 23:19:09 -04:00
f09849bedf Don't overwrite shutdown state on termination
Just a minor quibble and not really impactful.
2021-07-11 23:18:14 -04:00
8c975e5c46 Add chroot context manager example to debootstrap
Closes #132
2021-07-11 23:10:41 -04:00
c76149141f Only log ZK connections when persistent
Prevents spam in the API logs.
2021-07-10 23:35:49 -04:00
f00c4d07f4 Add date output to keepalive
Helps track when there is a log follow in "-o cat" mode.
2021-07-10 23:24:59 -04:00
20b66c10e1 Move two more commands to Rados library 2021-07-10 17:28:42 -04:00
cfeba50b17 Revert "Return to all command-based Ceph gathering"
This reverts commit 65d14ccd92.

This was actually a bad idea. For inexplicable reasons, running these
Ceph commands manually (not even via Python, but in a normal shell)
takes roughly 700 times (7 * two orders of magnitude) longer than running them with the
Rados module, so long in fact that some basic commands like "ceph
health" would sometimes take longer than the 1 second timeout to
complete. The Rados commands would however take about 1ms instead.

Despite the occasional issues when monitors drop out, the Rados module
is clearly far superior to the shell commands for any moderately-loaded
Ceph cluster. We can look into solving timeouts another way (perhaps
with Processes instead of Threads) at a later time.

Rados module "ceph health":
    b'{"checks":{},"status":"HEALTH_OK"}'
    0.001204 (s)
    b'{"checks":{},"status":"HEALTH_OK"}'
    0.001258 (s)
Command "ceph health":
    joshua@hv1.c.bonilan.net ~ $ time ceph health >/dev/null
    real    0m0.772s
    user    0m0.707s
    sys     0m0.046s
    joshua@hv1.c.bonilan.net ~ $ time ceph health >/dev/null
    real    0m0.796s
    user    0m0.728s
    sys     0m0.054s
2021-07-10 03:47:45 -04:00
11 changed files with 120 additions and 66 deletions

View File

@ -1 +1 @@
0.9.24
0.9.25

View File

@ -42,6 +42,14 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r
## Changelog
#### v0.9.25
* [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
* [Node Daemon] Adds a date output to keepalive messages
* [Daemons] Configures ZK connection logging only for persistent connections
* [API Provisioner] Adds context manager-based chroot to Debootstrap example script
* [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
#### v0.9.24
* [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements

View File

@ -34,6 +34,29 @@
# with that.
import os
from contextlib import contextmanager
# Create a chroot context manager
# This can be used later in the script to chroot to the destination directory
# for instance to run commands within the target.
@contextmanager
def chroot_target(destination):
    """Run the body of a ``with`` statement chrooted into *destination*.

    On entry, a descriptor to the real root directory is opened, the process
    chroots into ``destination``, and the working directory is moved to the
    new root. On exit (normal or via an exception) the original root is
    restored and both descriptors are closed, so the target can be unmounted
    afterwards.

    Args:
        destination: Path of the directory to chroot into.

    Raises:
        OSError: If the real root cannot be opened, or if entering or leaving
            the chroot fails (e.g. missing directory, insufficient
            privileges). The error from the failing call is propagated
            as-is rather than being masked by cleanup code.
    """
    # Open the real root *before* the try block: if this fails there is
    # nothing to clean up, and the finally clause below may rely on the
    # descriptor always being bound.
    real_root = os.open("/", os.O_RDONLY)
    fake_root = None
    chrooted = False
    try:
        os.chroot(destination)
        chrooted = True
        fake_root = os.open("/", os.O_RDONLY)
        os.fchdir(fake_root)
        yield
    finally:
        try:
            # Only undo the chroot if we actually entered it; otherwise the
            # caller's root and working directory are left untouched.
            if chrooted:
                # Step outside the chroot through the saved descriptor, make
                # that directory the root again, then settle the working
                # directory on the restored root.
                os.fchdir(real_root)
                os.chroot(".")
                os.fchdir(real_root)
        finally:
            # Always release the descriptors, even if restoring the root
            # failed; open handles would keep the target paths busy.
            if fake_root is not None:
                os.close(fake_root)
            os.close(real_root)
# Installation function - performs a debootstrap install of a Debian system
# Note that the only arguments are keyword arguments.
@ -193,13 +216,7 @@ GRUB_DISABLE_LINUX_UUID=false
fh.write(data)
# Chroot, do some in-root tasks, then exit the chroot
# EXITING THE CHROOT IS VERY IMPORTANT OR THE FOLLOWING STAGES OF THE PROVISIONER
# WILL FAIL IN UNEXPECTED WAYS! Keep this in mind when using chroot in your scripts.
real_root = os.open("/", os.O_RDONLY)
os.chroot(temporary_directory)
fake_root = os.open("/", os.O_RDONLY)
os.fchdir(fake_root)
with chroot_target(temporary_directory):
# Install and update GRUB
os.system(
"grub-install --force /dev/rbd/{}/{}_{}".format(root_disk['pool'], vm_name, root_disk['disk_id'])
@ -219,15 +236,6 @@ GRUB_DISABLE_LINUX_UUID=false
"systemctl enable cloud-init.target"
)
# Restore our original root/exit the chroot
# EXITING THE CHROOT IS VERY IMPORTANT OR THE FOLLOWING STAGES OF THE PROVISIONER
# WILL FAIL IN UNEXPECTED WAYS! Keep this in mind when using chroot in your scripts.
os.fchdir(real_root)
os.chroot(".")
os.fchdir(real_root)
os.close(fake_root)
os.close(real_root)
# Unmount the bound devfs
os.system(
"umount {}/dev".format(
@ -235,8 +243,4 @@ GRUB_DISABLE_LINUX_UUID=false
)
)
# Clean up file handles so paths can be unmounted
del fake_root
del real_root
# Everything else is done via cloud-init user-data

View File

@ -29,7 +29,7 @@
# This script will run under root privileges as the provisioner does. Be careful
# with that.
# Installation function - performs a debootstrap install of a Debian system
# Installation function - performs no actions then returns
# Note that the only arguments are keyword arguments.
def install(**kwargs):
# The provisioner has already mounted the disks on kwargs['temporary_directory'].

View File

@ -25,7 +25,7 @@ import yaml
from distutils.util import strtobool as dustrtobool
# Daemon version
version = '0.9.24'
version = '0.9.25'
# API version
API_VERSION = 1.0

View File

@ -2,7 +2,7 @@ from setuptools import setup
setup(
name='pvc',
version='0.9.24',
version='0.9.25',
packages=['pvc', 'pvc.cli_lib'],
install_requires=[
'Click',

View File

@ -140,13 +140,13 @@ class ZKHandler(object):
"""
try:
self.zk_conn.start()
self.log('Connection to Zookeeper started', state='o')
if persistent:
self.log('Connection to Zookeeper started', state='o')
self.zk_conn.add_listener(self.listener)
except Exception as e:
raise ZKConnectionException(self, e)
def disconnect(self):
def disconnect(self, persistent=False):
"""
Stop and close the zk_conn object and disconnect from the cluster
@ -154,6 +154,7 @@ class ZKHandler(object):
"""
self.zk_conn.stop()
self.zk_conn.close()
if persistent:
self.log('Connection to Zookeeper terminated', state='o')
#

10
debian/changelog vendored
View File

@ -1,3 +1,13 @@
pvc (0.9.25-0) unstable; urgency=high
* [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
* [Node Daemon] Adds a date output to keepalive messages
* [Daemons] Configures ZK connection logging only for persistent connections
* [API Provisioner] Adds context manager-based chroot to Debootstrap example script
* [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
-- Joshua M. Boniface <joshua@boniface.me> Sun, 11 Jul 2021 23:19:09 -0400
pvc (0.9.24-0) unstable; urgency=high
* [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements

2
debian/control vendored
View File

@ -8,7 +8,7 @@ X-Python3-Version: >= 3.2
Package: pvc-daemon-node
Architecture: all
Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-rados, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
Suggests: pvc-client-api, pvc-client-cli
Description: Parallel Virtual Cluster node daemon (Python 3)
A KVM/Zookeeper/Ceph-based VM and private cloud manager

View File

@ -42,6 +42,14 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r
## Changelog
#### v0.9.25
* [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
* [Node Daemon] Adds a date output to keepalive messages
* [Daemons] Configures ZK connection logging only for persistent connections
* [API Provisioner] Adds context manager-based chroot to Debootstrap example script
* [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
#### v0.9.24
* [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements

View File

@ -32,12 +32,14 @@ import yaml
import json
from socket import gethostname
from datetime import datetime
from threading import Thread
from ipaddress import ip_address, ip_network
from apscheduler.schedulers.background import BackgroundScheduler
from distutils.util import strtobool
from queue import Queue
from xml.etree import ElementTree
from rados import Rados
from daemon_lib.zkhandler import ZKHandler
@ -54,7 +56,7 @@ import pvcnoded.CephInstance as CephInstance
import pvcnoded.MetadataAPIInstance as MetadataAPIInstance
# Version string for startup output
version = '0.9.24'
version = '0.9.25'
###############################################################################
# PVCD - node daemon startup program
@ -657,7 +659,7 @@ def update_schema(new_schema_version, stat, event=''):
# Restart ourselves with the new schema
logger.out('Reloading node daemon', state='s')
try:
zkhandler.disconnect()
zkhandler.disconnect(persistent=True)
del zkhandler
except Exception:
pass
@ -750,7 +752,7 @@ def cleanup():
# Close the Zookeeper connection
try:
zkhandler.disconnect()
zkhandler.disconnect(persistent=True)
del zkhandler
except Exception:
pass
@ -1313,13 +1315,24 @@ def collect_ceph_stats(queue):
if debug:
logger.out("Thread starting", state='d', prefix='ceph-thread')
# Connect to the Ceph cluster
try:
ceph_conn = Rados(conffile=config['ceph_config_file'], conf=dict(keyring=config['ceph_admin_keyring']))
if debug:
logger.out("Connecting to cluster", state='d', prefix='ceph-thread')
ceph_conn.connect(timeout=1)
except Exception as e:
logger.out('Failed to open connection to Ceph cluster: {}'.format(e), state='e')
return
if debug:
logger.out("Getting health stats from monitor", state='d', prefix='ceph-thread')
# Get Ceph cluster health for local status output
_, stdout, _ = common.run_os_command('ceph health --format json', timeout=1)
command = {"prefix": "health", "format": "json"}
try:
ceph_health = json.loads(stdout)['status']
health_status = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])
ceph_health = health_status['status']
except Exception as e:
logger.out('Failed to obtain Ceph health data: {}'.format(e), state='e')
ceph_health = 'HEALTH_UNKN'
@ -1338,7 +1351,8 @@ def collect_ceph_stats(queue):
if debug:
logger.out("Set ceph health information in zookeeper (primary only)", state='d', prefix='ceph-thread')
_, ceph_status, _ = common.run_os_command('ceph status --format plain', timeout=1)
command = {"prefix": "status", "format": "pretty"}
ceph_status = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
try:
zkhandler.write([
('base.storage', str(ceph_status))
@ -1350,7 +1364,8 @@ def collect_ceph_stats(queue):
logger.out("Set ceph rados df information in zookeeper (primary only)", state='d', prefix='ceph-thread')
# Get rados df info
_, ceph_df, _ = common.run_os_command('ceph df --format plain', timeout=1)
command = {"prefix": "df", "format": "pretty"}
ceph_df = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
try:
zkhandler.write([
('base.storage.util', str(ceph_df))
@ -1362,14 +1377,15 @@ def collect_ceph_stats(queue):
logger.out("Set pool information in zookeeper (primary only)", state='d', prefix='ceph-thread')
# Get pool info
_, stdout, _ = common.run_os_command('ceph df --format json', timeout=1)
command = {"prefix": "df", "format": "json"}
ceph_df_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
try:
ceph_pool_df_raw = json.loads(stdout)['pools']
ceph_pool_df_raw = json.loads(ceph_df_output)['pools']
except Exception as e:
logger.out('Failed to obtain Pool data (ceph df): {}'.format(e), state='w')
ceph_pool_df_raw = []
_, stdout, _ = common.run_os_command('rados df --format json', timeout=1)
retcode, stdout, stderr = common.run_os_command('rados df --format json', timeout=1)
try:
rados_pool_df_raw = json.loads(stdout)['pools']
except Exception as e:
@ -1434,9 +1450,10 @@ def collect_ceph_stats(queue):
# Parse the dump data
osd_dump = dict()
_, stdout, _ = common.run_os_command('ceph osd dump --format json --connect-timeout 1', timeout=1)
command = {"prefix": "osd dump", "format": "json"}
osd_dump_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
try:
osd_dump_raw = json.loads(stdout)['osds']
osd_dump_raw = json.loads(osd_dump_output)['osds']
except Exception as e:
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
osd_dump_raw = []
@ -1459,9 +1476,9 @@ def collect_ceph_stats(queue):
osd_df = dict()
_, osd_df_out, _ = common.run_os_command('ceph osd df --format json', timeout=1)
command = {"prefix": "osd df", "format": "json"}
try:
osd_df_raw = json.loads(osd_df_out)['nodes']
osd_df_raw = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])['nodes']
except Exception as e:
logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
osd_df_raw = []
@ -1486,10 +1503,12 @@ def collect_ceph_stats(queue):
osd_status = dict()
retcode, osd_status_raw, stderr = common.run_os_command('ceph osd status --format plain', timeout=1)
if retcode != 0:
logger.out('Failed to obtain OSD status data: {}'.format(stderr), state='w')
osd_status_raw = ''
command = {"prefix": "osd status", "format": "pretty"}
try:
osd_status_raw = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
except Exception as e:
logger.out('Failed to obtain OSD status data: {}'.format(e), state='w')
osd_status_raw = []
if debug:
logger.out("Loop through OSD status data", state='d', prefix='ceph-thread')
@ -1556,6 +1575,8 @@ def collect_ceph_stats(queue):
# One or more of the status commands timed out, just continue
logger.out('Failed to upload OSD stats from dictionary: {}'.format(e), state='w')
ceph_conn.shutdown()
queue.put(ceph_health_colour)
queue.put(ceph_health)
queue.put(osds_this_node)
@ -1758,8 +1779,9 @@ def node_keepalive():
# Get past state and update if needed
if debug:
logger.out("Get past state and update if needed", state='d', prefix='main-thread')
past_state = zkhandler.read(('node.state.daemon', this_node.name))
if past_state != 'run':
if past_state != 'run' and past_state != 'shutdown':
this_node.daemon_state = 'run'
zkhandler.write([
(('node.state.daemon', this_node.name), 'run')
@ -1858,9 +1880,10 @@ def node_keepalive():
else:
cst_colour = fmt_cyan
logger.out(
'{}{} keepalive{} [{}{}{}]'.format(
'{}{} keepalive @ {}{} [{}{}{}]'.format(
fmt_purple,
myhostname,
datetime.now(),
fmt_end,
fmt_bold + cst_colour,
this_node.router_state,