Compare commits

..

7 Commits

Author SHA1 Message Date
2e9f6ac201 Bump version to 0.9.25 2021-07-11 23:19:09 -04:00
f09849bedf Don't overwrite shutdown state on termination
Just a minor quibble and not really impactful.
2021-07-11 23:18:14 -04:00
8c975e5c46 Add chroot context manager example to debootstrap
Closes #132
2021-07-11 23:10:41 -04:00
c76149141f Only log ZK connections when persistent
Prevents spam in the API logs.
2021-07-10 23:35:49 -04:00
f00c4d07f4 Add date output to keepalive
Helps track when there is a log follow in "-o cat" mode.
2021-07-10 23:24:59 -04:00
20b66c10e1 Move two more commands to Rados library 2021-07-10 17:28:42 -04:00
cfeba50b17 Revert "Return to all command-based Ceph gathering"
This reverts commit 65d14ccd92.

This was actually a bad idea. For inexplicable reasons, running these
Ceph commands manually (not even via Python, but in a normal shell)
takes several hundred times — nearly three orders of magnitude — longer
than running them with the Rados module; so long, in fact, that basic
commands like "ceph health" would sometimes exceed the 1-second timeout.
The Rados calls, by contrast, complete in about 1ms.

Despite the occasional issues when monitors drop out, the Rados module
is clearly far superior to the shell commands for any moderately-loaded
Ceph cluster. We can look into solving timeouts another way (perhaps
with Processes instead of Threads) at a later time.

Rados module "ceph health":
    b'{"checks":{},"status":"HEALTH_OK"}'
    0.001204 (s)
    b'{"checks":{},"status":"HEALTH_OK"}'
    0.001258 (s)
Command "ceph health":
    joshua@hv1.c.bonilan.net ~ $ time ceph health >/dev/null
    real    0m0.772s
    user    0m0.707s
    sys     0m0.046s
    joshua@hv1.c.bonilan.net ~ $ time ceph health >/dev/null
    real    0m0.796s
    user    0m0.728s
    sys     0m0.054s
2021-07-10 03:47:45 -04:00
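For context, the timing gap described in the commit message above can be reproduced with a short sketch along these lines (assuming python3-rados is installed and the stock /etc/ceph paths; nothing here is part of the repository):

    import json
    import subprocess
    import time

    from rados import Rados

    # Connect once via librados, as the node daemon does, but with default paths.
    conn = Rados(conffile='/etc/ceph/ceph.conf',
                 conf=dict(keyring='/etc/ceph/ceph.client.admin.keyring'))
    conn.connect(timeout=1)

    # Time "ceph health" through the Rados library...
    start = time.monotonic()
    conn.mon_command(json.dumps({"prefix": "health", "format": "json"}), b'', timeout=1)
    print('rados mon_command: {:.6f}s'.format(time.monotonic() - start))

    # ...and through the CLI, as timed in the shell transcript above.
    start = time.monotonic()
    subprocess.run(['ceph', 'health', '--format', 'json'],
                   stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    print('ceph CLI:          {:.6f}s'.format(time.monotonic() - start))

    conn.shutdown()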
11 changed files with 120 additions and 66 deletions

@@ -1 +1 @@
-0.9.24
+0.9.25

@@ -42,6 +42,14 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r
 ## Changelog
 
+#### v0.9.25
+
+* [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
+* [Node Daemon] Adds a date output to keepalive messages
+* [Daemons] Configures ZK connection logging only for persistent connections
+* [API Provisioner] Add context manager-based chroot to Debootstrap example script
+* [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
+
 #### v0.9.24
 
 * [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements

@@ -34,6 +34,29 @@
 # with that.
 
 import os
+from contextlib import contextmanager
+
+
+# Create a chroot context manager
+# This can be used later in the script to chroot to the destination directory
+# for instance to run commands within the target.
+@contextmanager
+def chroot_target(destination):
+    try:
+        real_root = os.open("/", os.O_RDONLY)
+        os.chroot(destination)
+        fake_root = os.open("/", os.O_RDONLY)
+        os.fchdir(fake_root)
+        yield
+    finally:
+        os.fchdir(real_root)
+        os.chroot(".")
+        os.fchdir(real_root)
+        os.close(fake_root)
+        os.close(real_root)
+        del fake_root
+        del real_root
+
 
 # Installation function - performs a debootstrap install of a Debian system
 # Note that the only arguments are keyword arguments.
@@ -193,13 +216,7 @@ GRUB_DISABLE_LINUX_UUID=false
     fh.write(data)
 
     # Chroot, do some in-root tasks, then exit the chroot
-    # EXITING THE CHROOT IS VERY IMPORTANT OR THE FOLLOWING STAGES OF THE PROVISIONER
-    # WILL FAIL IN UNEXPECTED WAYS! Keep this in mind when using chroot in your scripts.
-    real_root = os.open("/", os.O_RDONLY)
-    os.chroot(temporary_directory)
-    fake_root = os.open("/", os.O_RDONLY)
-    os.fchdir(fake_root)
+    with chroot_target(temporary_directory):
 
     # Install and update GRUB
     os.system(
         "grub-install --force /dev/rbd/{}/{}_{}".format(root_disk['pool'], vm_name, root_disk['disk_id'])

@@ -219,15 +236,6 @@ GRUB_DISABLE_LINUX_UUID=false
         "systemctl enable cloud-init.target"
     )
 
-    # Restore our original root/exit the chroot
-    # EXITING THE CHROOT IS VERY IMPORTANT OR THE FOLLOWING STAGES OF THE PROVISIONER
-    # WILL FAIL IN UNEXPECTED WAYS! Keep this in mind when using chroot in your scripts.
-    os.fchdir(real_root)
-    os.chroot(".")
-    os.fchdir(real_root)
-    os.close(fake_root)
-    os.close(real_root)
-
     # Unmount the bound devfs
     os.system(
         "umount {}/dev".format(

@@ -235,8 +243,4 @@ GRUB_DISABLE_LINUX_UUID=false
         )
     )
 
-    # Clean up file handles so paths can be unmounted
-    del fake_root
-    del real_root
-
     # Everything else is done via cloud-init user-data
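A note on the pattern above: because the restore steps now live in the context manager's finally block, the original root is re-entered even if a command inside the chroot raises. A minimal standalone sketch of that property (it assumes the chroot_target() definition from this diff, root privileges, and a placeholder target path that is not from the repository):

    import os

    try:
        with chroot_target("/srv/target"):  # "/srv/target" is a placeholder path
            os.system("systemctl enable cloud-init.target")
            raise RuntimeError("simulated failure inside the chroot")
    except RuntimeError:
        pass

    # Back in the original root here, so the provisioner's later unmount and
    # cleanup stages still see the host filesystem as expected.
    print(os.listdir("/"))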

@@ -29,7 +29,7 @@
 # This script will run under root privileges as the provisioner does. Be careful
 # with that.
 
-# Installation function - performs a debootstrap install of a Debian system
+# Installation function - performs no actions then returns
 # Note that the only arguments are keyword arguments.
 def install(**kwargs):
     # The provisioner has already mounted the disks on kwargs['temporary_directory'].

@@ -25,7 +25,7 @@ import yaml
 from distutils.util import strtobool as dustrtobool
 
 # Daemon version
-version = '0.9.24'
+version = '0.9.25'
 
 # API version
 API_VERSION = 1.0

@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name='pvc',
-    version='0.9.24',
+    version='0.9.25',
     packages=['pvc', 'pvc.cli_lib'],
     install_requires=[
         'Click',

@@ -140,13 +140,13 @@ class ZKHandler(object):
         """
         try:
             self.zk_conn.start()
-            self.log('Connection to Zookeeper started', state='o')
             if persistent:
+                self.log('Connection to Zookeeper started', state='o')
                 self.zk_conn.add_listener(self.listener)
         except Exception as e:
             raise ZKConnectionException(self, e)
 
-    def disconnect(self):
+    def disconnect(self, persistent=False):
         """
         Stop and close the zk_conn object and disconnect from the cluster

@@ -154,6 +154,7 @@ class ZKHandler(object):
         """
         self.zk_conn.stop()
         self.zk_conn.close()
-        self.log('Connection to Zookeeper terminated', state='o')
+        if persistent:
+            self.log('Connection to Zookeeper terminated', state='o')
 
     #
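The intent of the change above is that only long-lived (persistent) connections, such as the node daemon's, log their connect and disconnect events, while the short-lived connections opened per API request stay quiet. A sketch of the resulting call pattern (object construction is elided; the persistent keyword on connect() already exists in the unchanged context lines of this diff):

    # Node daemon: long-lived connection, events are logged
    zkhandler.connect(persistent=True)
    # ... daemon lifetime ...
    zkhandler.disconnect(persistent=True)

    # API request: short-lived connection, nothing is logged
    zkhandler.connect(persistent=False)
    # ... single operation ...
    zkhandler.disconnect()  # persistent defaults to False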

debian/changelog

@@ -1,3 +1,13 @@
+pvc (0.9.25-0) unstable; urgency=high
+
+  * [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
+  * [Node Daemon] Adds a date output to keepalive messages
+  * [Daemons] Configures ZK connection logging only for persistent connections
+  * [API Provisioner] Add context manager-based chroot to Debootstrap example script
+  * [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
+
+ -- Joshua M. Boniface <joshua@boniface.me>  Sun, 11 Jul 2021 23:19:09 -0400
+
 pvc (0.9.24-0) unstable; urgency=high
 
   * [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements

debian/control

@@ -8,7 +8,7 @@ X-Python3-Version: >= 3.2
 Package: pvc-daemon-node
 Architecture: all
-Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
+Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-rados, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
 Suggests: pvc-client-api, pvc-client-cli
 Description: Parallel Virtual Cluster node daemon (Python 3)
  A KVM/Zookeeper/Ceph-based VM and private cloud manager

@@ -42,6 +42,14 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r
 ## Changelog
 
+#### v0.9.25
+
+* [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
+* [Node Daemon] Adds a date output to keepalive messages
+* [Daemons] Configures ZK connection logging only for persistent connections
+* [API Provisioner] Add context manager-based chroot to Debootstrap example script
+* [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
+
 #### v0.9.24
 
 * [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements

@@ -32,12 +32,14 @@ import yaml
 import json
 
 from socket import gethostname
+from datetime import datetime
 from threading import Thread
 from ipaddress import ip_address, ip_network
 from apscheduler.schedulers.background import BackgroundScheduler
 from distutils.util import strtobool
 from queue import Queue
 from xml.etree import ElementTree
+from rados import Rados
 
 from daemon_lib.zkhandler import ZKHandler

@@ -54,7 +56,7 @@ import pvcnoded.CephInstance as CephInstance
 import pvcnoded.MetadataAPIInstance as MetadataAPIInstance
 
 # Version string for startup output
-version = '0.9.24'
+version = '0.9.25'
 
 ###############################################################################
 # PVCD - node daemon startup program
@@ -657,7 +659,7 @@ def update_schema(new_schema_version, stat, event=''):
     # Restart ourselves with the new schema
     logger.out('Reloading node daemon', state='s')
     try:
-        zkhandler.disconnect()
+        zkhandler.disconnect(persistent=True)
         del zkhandler
     except Exception:
         pass

@@ -750,7 +752,7 @@ def cleanup():
     # Close the Zookeeper connection
     try:
-        zkhandler.disconnect()
+        zkhandler.disconnect(persistent=True)
         del zkhandler
     except Exception:
         pass
@@ -1313,13 +1315,24 @@ def collect_ceph_stats(queue):
     if debug:
         logger.out("Thread starting", state='d', prefix='ceph-thread')
 
+    # Connect to the Ceph cluster
+    try:
+        ceph_conn = Rados(conffile=config['ceph_config_file'], conf=dict(keyring=config['ceph_admin_keyring']))
+        if debug:
+            logger.out("Connecting to cluster", state='d', prefix='ceph-thread')
+        ceph_conn.connect(timeout=1)
+    except Exception as e:
+        logger.out('Failed to open connection to Ceph cluster: {}'.format(e), state='e')
+        return
+
     if debug:
         logger.out("Getting health stats from monitor", state='d', prefix='ceph-thread')
 
     # Get Ceph cluster health for local status output
-    _, stdout, _ = common.run_os_command('ceph health --format json', timeout=1)
+    command = {"prefix": "health", "format": "json"}
     try:
-        ceph_health = json.loads(stdout)['status']
+        health_status = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])
+        ceph_health = health_status['status']
     except Exception as e:
         logger.out('Failed to obtain Ceph health data: {}'.format(e), state='e')
         ceph_health = 'HEALTH_UNKN'
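For readers unfamiliar with the librados Python binding used here: Rados.mon_command() takes a JSON-encoded command string and an input buffer, and returns a (retcode, outbuf, outs) tuple, which is why the calls in this diff index [1] and then decode or json.loads() the output buffer. A minimal sketch with stock paths (not taken from this repository):

    import json
    from rados import Rados

    conn = Rados(conffile='/etc/ceph/ceph.conf',
                 conf=dict(keyring='/etc/ceph/ceph.client.admin.keyring'))
    conn.connect(timeout=1)
    retcode, outbuf, outs = conn.mon_command(
        json.dumps({"prefix": "health", "format": "json"}), b'', timeout=1)
    print(json.loads(outbuf)['status'])  # e.g. HEALTH_OK
    conn.shutdown()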
@@ -1338,7 +1351,8 @@ def collect_ceph_stats(queue):
         if debug:
             logger.out("Set ceph health information in zookeeper (primary only)", state='d', prefix='ceph-thread')
 
-        _, ceph_status, _ = common.run_os_command('ceph status --format plain', timeout=1)
+        command = {"prefix": "status", "format": "pretty"}
+        ceph_status = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
         try:
             zkhandler.write([
                 ('base.storage', str(ceph_status))

@@ -1350,7 +1364,8 @@ def collect_ceph_stats(queue):
             logger.out("Set ceph rados df information in zookeeper (primary only)", state='d', prefix='ceph-thread')
 
         # Get rados df info
-        _, ceph_df, _ = common.run_os_command('ceph df --format plain', timeout=1)
+        command = {"prefix": "df", "format": "pretty"}
+        ceph_df = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
         try:
             zkhandler.write([
                 ('base.storage.util', str(ceph_df))
@@ -1362,14 +1377,15 @@ def collect_ceph_stats(queue):
             logger.out("Set pool information in zookeeper (primary only)", state='d', prefix='ceph-thread')
 
         # Get pool info
-        _, stdout, _ = common.run_os_command('ceph df --format json', timeout=1)
+        command = {"prefix": "df", "format": "json"}
+        ceph_df_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
         try:
-            ceph_pool_df_raw = json.loads(stdout)['pools']
+            ceph_pool_df_raw = json.loads(ceph_df_output)['pools']
         except Exception as e:
             logger.out('Failed to obtain Pool data (ceph df): {}'.format(e), state='w')
             ceph_pool_df_raw = []
 
-        _, stdout, _ = common.run_os_command('rados df --format json', timeout=1)
+        retcode, stdout, stderr = common.run_os_command('rados df --format json', timeout=1)
         try:
             rados_pool_df_raw = json.loads(stdout)['pools']
         except Exception as e:
@@ -1434,9 +1450,10 @@ def collect_ceph_stats(queue):
     # Parse the dump data
     osd_dump = dict()
 
-    _, stdout, _ = common.run_os_command('ceph osd dump --format json --connect-timeout 1', timeout=1)
+    command = {"prefix": "osd dump", "format": "json"}
+    osd_dump_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
     try:
-        osd_dump_raw = json.loads(stdout)['osds']
+        osd_dump_raw = json.loads(osd_dump_output)['osds']
     except Exception as e:
         logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
         osd_dump_raw = []

@@ -1459,9 +1476,9 @@ def collect_ceph_stats(queue):
     osd_df = dict()
 
-    _, osd_df_out, _ = common.run_os_command('ceph osd df --format json', timeout=1)
+    command = {"prefix": "osd df", "format": "json"}
     try:
-        osd_df_raw = json.loads(osd_df_out)['nodes']
+        osd_df_raw = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])['nodes']
     except Exception as e:
         logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
         osd_df_raw = []
@@ -1486,10 +1503,12 @@ def collect_ceph_stats(queue):
     osd_status = dict()
 
-    retcode, osd_status_raw, stderr = common.run_os_command('ceph osd status --format plain', timeout=1)
-    if retcode != 0:
-        logger.out('Failed to obtain OSD status data: {}'.format(stderr), state='w')
-        osd_status_raw = ''
+    command = {"prefix": "osd status", "format": "pretty"}
+    try:
+        osd_status_raw = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
+    except Exception as e:
+        logger.out('Failed to obtain OSD status data: {}'.format(e), state='w')
+        osd_status_raw = []
 
     if debug:
         logger.out("Loop through OSD status data", state='d', prefix='ceph-thread')

@@ -1556,6 +1575,8 @@ def collect_ceph_stats(queue):
             # One or more of the status commands timed out, just continue
             logger.out('Failed to upload OSD stats from dictionary: {}'.format(e), state='w')
 
+    ceph_conn.shutdown()
+
     queue.put(ceph_health_colour)
     queue.put(ceph_health)
     queue.put(osds_this_node)
@@ -1758,8 +1779,9 @@ def node_keepalive():
     # Get past state and update if needed
     if debug:
         logger.out("Get past state and update if needed", state='d', prefix='main-thread')
+
     past_state = zkhandler.read(('node.state.daemon', this_node.name))
-    if past_state != 'run':
+    if past_state != 'run' and past_state != 'shutdown':
         this_node.daemon_state = 'run'
         zkhandler.write([
             (('node.state.daemon', this_node.name), 'run')

@@ -1858,9 +1880,10 @@ def node_keepalive():
         else:
             cst_colour = fmt_cyan
         logger.out(
-            '{}{} keepalive{} [{}{}{}]'.format(
+            '{}{} keepalive @ {}{} [{}{}{}]'.format(
                 fmt_purple,
                 myhostname,
+                datetime.now(),
                 fmt_end,
                 fmt_bold + cst_colour,
                 this_node.router_state,
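With the datetime import and the reworked format string above, each keepalive line now carries a timestamp. Roughly, with colour codes dropped and placeholder hostname and state values (not actual daemon output):

    from datetime import datetime

    myhostname = 'hv1'            # placeholder
    router_state = 'primary'      # placeholder
    print('{} keepalive @ {} [{}]'.format(myhostname, datetime.now(), router_state))
    # prints e.g.: hv1 keepalive @ 2021-07-11 23:19:09.123456 [primary]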