Compare commits
7 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 2e9f6ac201 | |
| | f09849bedf | |
| | 8c975e5c46 | |
| | c76149141f | |
| | f00c4d07f4 | |
| | 20b66c10e1 | |
| | cfeba50b17 | |
@@ -42,6 +42,14 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r
 
 ## Changelog
 
+#### v0.9.25
+
+* [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
+* [Node Daemon] Adds a date output to keepalive messages
+* [Daemons] Configures ZK connection logging only for persistent connections
+* [API Provisioner] Add context manager-based chroot to Debootstrap example script
+* [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
+
 #### v0.9.24
 
 * [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
@@ -34,6 +34,29 @@
 # with that.
 
 import os
+from contextlib import contextmanager
+
+
+# Create a chroot context manager
+# This can be used later in the script to chroot to the destination directory
+# for instance to run commands within the target.
+@contextmanager
+def chroot_target(destination):
+    try:
+        real_root = os.open("/", os.O_RDONLY)
+        os.chroot(destination)
+        fake_root = os.open("/", os.O_RDONLY)
+        os.fchdir(fake_root)
+        yield
+    finally:
+        os.fchdir(real_root)
+        os.chroot(".")
+        os.fchdir(real_root)
+        os.close(fake_root)
+        os.close(real_root)
+        del fake_root
+        del real_root
+
 
 # Installation function - performs a debootstrap install of a Debian system
 # Note that the only arguments are keyword arguments.
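Note: because the restore logic lives in the `finally` clause, the context manager above guarantees the chroot is exited even if a command inside it raises. A minimal usage sketch; the target path and package are hypothetical examples, not part of this diff:

```python
# Hypothetical usage of the chroot_target() helper defined above (os is already
# imported in the example script).
with chroot_target("/tmp/provisioner-target"):    # example path only
    # Everything here runs with / remapped to the target filesystem.
    os.system("apt-get install -y openssh-server")
# On exit, normal or via exception, the finally block restores the original root.
```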
@@ -193,40 +216,25 @@ GRUB_DISABLE_LINUX_UUID=false
     fh.write(data)
 
-    # Chroot, do some in-root tasks, then exit the chroot
-    # EXITING THE CHROOT IS VERY IMPORTANT OR THE FOLLOWING STAGES OF THE PROVISIONER
-    # WILL FAIL IN UNEXPECTED WAYS! Keep this in mind when using chroot in your scripts.
-    real_root = os.open("/", os.O_RDONLY)
-    os.chroot(temporary_directory)
-    fake_root = os.open("/", os.O_RDONLY)
-    os.fchdir(fake_root)
-
-    # Install and update GRUB
-    os.system(
-        "grub-install --force /dev/rbd/{}/{}_{}".format(root_disk['pool'], vm_name, root_disk['disk_id'])
-    )
-    os.system(
-        "update-grub"
-    )
-    # Set a really dumb root password [TEMPORARY]
-    os.system(
-        "echo root:test123 | chpasswd"
-    )
-    # Enable cloud-init target on (first) boot
-    # NOTE: Your user-data should handle this and disable it once done, or things get messy.
-    # That cloud-init won't run without this hack seems like a bug... but even the official
-    # Debian cloud images are affected, so who knows.
-    os.system(
-        "systemctl enable cloud-init.target"
-    )
-
-    # Restore our original root/exit the chroot
-    # EXITING THE CHROOT IS VERY IMPORTANT OR THE FOLLOWING STAGES OF THE PROVISIONER
-    # WILL FAIL IN UNEXPECTED WAYS! Keep this in mind when using chroot in your scripts.
-    os.fchdir(real_root)
-    os.chroot(".")
-    os.fchdir(real_root)
-    os.close(fake_root)
-    os.close(real_root)
+    with chroot_target(temporary_directory):
+        # Install and update GRUB
+        os.system(
+            "grub-install --force /dev/rbd/{}/{}_{}".format(root_disk['pool'], vm_name, root_disk['disk_id'])
+        )
+        os.system(
+            "update-grub"
+        )
+        # Set a really dumb root password [TEMPORARY]
+        os.system(
+            "echo root:test123 | chpasswd"
+        )
+        # Enable cloud-init target on (first) boot
+        # NOTE: Your user-data should handle this and disable it once done, or things get messy.
+        # That cloud-init won't run without this hack seems like a bug... but even the official
+        # Debian cloud images are affected, so who knows.
+        os.system(
+            "systemctl enable cloud-init.target"
+        )
 
     # Unmount the bound devfs
     os.system(
@@ -235,8 +243,4 @@ GRUB_DISABLE_LINUX_UUID=false
         )
     )
 
-    # Clean up file handles so paths can be unmounted
-    del fake_root
-    del real_root
-
     # Everything else is done via cloud-init user-data
@@ -29,7 +29,7 @@
 # This script will run under root privileges as the provisioner does. Be careful
 # with that.
 
-# Installation function - performs a debootstrap install of a Debian system
+# Installation function - performs no actions then returns
 # Note that the only arguments are keyword arguments.
 def install(**kwargs):
     # The provisioner has already mounted the disks on kwargs['temporary_directory'].
@@ -25,7 +25,7 @@ import yaml
 from distutils.util import strtobool as dustrtobool
 
 # Daemon version
-version = '0.9.24'
+version = '0.9.25'
 
 # API version
 API_VERSION = 1.0
@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name='pvc',
-    version='0.9.24',
+    version='0.9.25',
     packages=['pvc', 'pvc.cli_lib'],
     install_requires=[
         'Click',
@@ -140,13 +140,13 @@ class ZKHandler(object):
         """
         try:
             self.zk_conn.start()
-            self.log('Connection to Zookeeper started', state='o')
             if persistent:
+                self.log('Connection to Zookeeper started', state='o')
                 self.zk_conn.add_listener(self.listener)
         except Exception as e:
             raise ZKConnectionException(self, e)
 
-    def disconnect(self):
+    def disconnect(self, persistent=False):
         """
         Stop and close the zk_conn object and disconnect from the cluster
 
@@ -154,7 +154,8 @@ class ZKHandler(object):
         """
         self.zk_conn.stop()
         self.zk_conn.close()
-        self.log('Connection to Zookeeper terminated', state='o')
+        if persistent:
+            self.log('Connection to Zookeeper terminated', state='o')
 
     #
     # Schema helper actions
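Note: the ZKHandler hunks above gate connection-state logging on the `persistent` flag, so short-lived CLI/API connections stop logging every connect and disconnect while the daemons' long-lived connections still do. A minimal sketch of the same pattern using kazoo directly; the class name and host string are illustrative, not PVC's actual API:

```python
from kazoo.client import KazooClient


class QuietZKConnection:
    """Log connection events only when the connection is persistent (daemon-style)."""

    def __init__(self, hosts="127.0.0.1:2181"):   # example host, not taken from this diff
        self.zk_conn = KazooClient(hosts=hosts)

    def connect(self, persistent=False):
        self.zk_conn.start()
        if persistent:
            print("Connection to Zookeeper started")

    def disconnect(self, persistent=False):
        self.zk_conn.stop()
        self.zk_conn.close()
        if persistent:
            print("Connection to Zookeeper terminated")
```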
10
debian/changelog
vendored
@@ -1,3 +1,13 @@
+pvc (0.9.25-0) unstable; urgency=high
+
+  * [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
+  * [Node Daemon] Adds a date output to keepalive messages
+  * [Daemons] Configures ZK connection logging only for persistent connections
+  * [API Provisioner] Add context manager-based chroot to Debootstrap example script
+  * [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
+
+ -- Joshua M. Boniface <joshua@boniface.me>  Sun, 11 Jul 2021 23:19:09 -0400
+
 pvc (0.9.24-0) unstable; urgency=high
 
   * [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
2
debian/control
vendored
@@ -8,7 +8,7 @@ X-Python3-Version: >= 3.2
 
 Package: pvc-daemon-node
 Architecture: all
-Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
+Depends: systemd, pvc-daemon-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, python3-rados, python3-gevent, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
 Suggests: pvc-client-api, pvc-client-cli
 Description: Parallel Virtual Cluster node daemon (Python 3)
  A KVM/Zookeeper/Ceph-based VM and private cloud manager
@@ -42,6 +42,14 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r
 
 ## Changelog
 
+#### v0.9.25
+
+* [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
+* [Node Daemon] Adds a date output to keepalive messages
+* [Daemons] Configures ZK connection logging only for persistent connections
+* [API Provisioner] Add context manager-based chroot to Debootstrap example script
+* [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
+
 #### v0.9.24
 
 * [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
@@ -32,12 +32,14 @@ import yaml
 import json
 
 from socket import gethostname
+from datetime import datetime
 from threading import Thread
 from ipaddress import ip_address, ip_network
 from apscheduler.schedulers.background import BackgroundScheduler
 from distutils.util import strtobool
 from queue import Queue
 from xml.etree import ElementTree
+from rados import Rados
 
 from daemon_lib.zkhandler import ZKHandler
 
@@ -54,7 +56,7 @@ import pvcnoded.CephInstance as CephInstance
 import pvcnoded.MetadataAPIInstance as MetadataAPIInstance
 
 # Version string for startup output
-version = '0.9.24'
+version = '0.9.25'
 
 ###############################################################################
 # PVCD - node daemon startup program
@@ -657,7 +659,7 @@ def update_schema(new_schema_version, stat, event=''):
     # Restart ourselves with the new schema
     logger.out('Reloading node daemon', state='s')
     try:
-        zkhandler.disconnect()
+        zkhandler.disconnect(persistent=True)
         del zkhandler
     except Exception:
         pass
@@ -750,7 +752,7 @@ def cleanup():
 
     # Close the Zookeeper connection
     try:
-        zkhandler.disconnect()
+        zkhandler.disconnect(persistent=True)
         del zkhandler
     except Exception:
         pass
@@ -1313,13 +1315,24 @@ def collect_ceph_stats(queue):
     if debug:
         logger.out("Thread starting", state='d', prefix='ceph-thread')
 
+    # Connect to the Ceph cluster
+    try:
+        ceph_conn = Rados(conffile=config['ceph_config_file'], conf=dict(keyring=config['ceph_admin_keyring']))
+        if debug:
+            logger.out("Connecting to cluster", state='d', prefix='ceph-thread')
+        ceph_conn.connect(timeout=1)
+    except Exception as e:
+        logger.out('Failed to open connection to Ceph cluster: {}'.format(e), state='e')
+        return
+
     if debug:
         logger.out("Getting health stats from monitor", state='d', prefix='ceph-thread')
 
     # Get Ceph cluster health for local status output
-    _, stdout, _ = common.run_os_command('ceph health --format json', timeout=1)
+    command = {"prefix": "health", "format": "json"}
     try:
-        ceph_health = json.loads(stdout)['status']
+        health_status = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])
+        ceph_health = health_status['status']
     except Exception as e:
         logger.out('Failed to obtain Ceph health data: {}'.format(e), state='e')
         ceph_health = 'HEALTH_UNKN'
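Note: `Rados.mon_command()` takes a JSON-encoded command plus an input buffer and returns a `(retcode, output_bytes, error_string)` tuple, which is why the new code indexes `[1]` and decodes the plain-text outputs before storing them. A hedged sketch of the pattern, assuming an already-connected `ceph_conn` handle like the one created above:

```python
import json

# Assumes ceph_conn is a connected rados.Rados handle (see the hunk above).
command = {"prefix": "health", "format": "json"}
retcode, outbuf, errstr = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)
if retcode == 0:
    ceph_health = json.loads(outbuf)['status']   # JSON output parses directly
else:
    ceph_health = 'HEALTH_UNKN'                  # fall back, as the daemon does
```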
@@ -1338,7 +1351,8 @@ def collect_ceph_stats(queue):
         if debug:
             logger.out("Set ceph health information in zookeeper (primary only)", state='d', prefix='ceph-thread')
 
-        _, ceph_status, _ = common.run_os_command('ceph status --format plain', timeout=1)
+        command = {"prefix": "status", "format": "pretty"}
+        ceph_status = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
         try:
             zkhandler.write([
                 ('base.storage', str(ceph_status))
@@ -1350,7 +1364,8 @@ def collect_ceph_stats(queue):
             logger.out("Set ceph rados df information in zookeeper (primary only)", state='d', prefix='ceph-thread')
 
         # Get rados df info
-        _, ceph_df, _ = common.run_os_command('ceph df --format plain', timeout=1)
+        command = {"prefix": "df", "format": "pretty"}
+        ceph_df = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
         try:
             zkhandler.write([
                 ('base.storage.util', str(ceph_df))
@@ -1362,14 +1377,15 @@ def collect_ceph_stats(queue):
             logger.out("Set pool information in zookeeper (primary only)", state='d', prefix='ceph-thread')
 
         # Get pool info
-        _, stdout, _ = common.run_os_command('ceph df --format json', timeout=1)
+        command = {"prefix": "df", "format": "json"}
+        ceph_df_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
         try:
-            ceph_pool_df_raw = json.loads(stdout)['pools']
+            ceph_pool_df_raw = json.loads(ceph_df_output)['pools']
         except Exception as e:
             logger.out('Failed to obtain Pool data (ceph df): {}'.format(e), state='w')
             ceph_pool_df_raw = []
 
-        _, stdout, _ = common.run_os_command('rados df --format json', timeout=1)
+        retcode, stdout, stderr = common.run_os_command('rados df --format json', timeout=1)
         try:
             rados_pool_df_raw = json.loads(stdout)['pools']
         except Exception as e:
@@ -1434,9 +1450,10 @@ def collect_ceph_stats(queue):
     # Parse the dump data
     osd_dump = dict()
 
-    _, stdout, _ = common.run_os_command('ceph osd dump --format json --connect-timeout 1', timeout=1)
+    command = {"prefix": "osd dump", "format": "json"}
+    osd_dump_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
     try:
-        osd_dump_raw = json.loads(stdout)['osds']
+        osd_dump_raw = json.loads(osd_dump_output)['osds']
     except Exception as e:
         logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
         osd_dump_raw = []
@@ -1459,9 +1476,9 @@ def collect_ceph_stats(queue):
 
     osd_df = dict()
 
-    _, osd_df_out, _ = common.run_os_command('ceph osd df --format json', timeout=1)
+    command = {"prefix": "osd df", "format": "json"}
     try:
-        osd_df_raw = json.loads(osd_df_out)['nodes']
+        osd_df_raw = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])['nodes']
     except Exception as e:
         logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
         osd_df_raw = []
@@ -1486,10 +1503,12 @@ def collect_ceph_stats(queue):
 
     osd_status = dict()
 
-    retcode, osd_status_raw, stderr = common.run_os_command('ceph osd status --format plain', timeout=1)
-    if retcode != 0:
-        logger.out('Failed to obtain OSD status data: {}'.format(stderr), state='w')
-        osd_status_raw = ''
+    command = {"prefix": "osd status", "format": "pretty"}
+    try:
+        osd_status_raw = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
+    except Exception as e:
+        logger.out('Failed to obtain OSD status data: {}'.format(e), state='w')
+        osd_status_raw = []
 
     if debug:
         logger.out("Loop through OSD status data", state='d', prefix='ceph-thread')
@@ -1556,6 +1575,8 @@ def collect_ceph_stats(queue):
         # One or more of the status commands timed out, just continue
         logger.out('Failed to upload OSD stats from dictionary: {}'.format(e), state='w')
 
+    ceph_conn.shutdown()
+
     queue.put(ceph_health_colour)
     queue.put(ceph_health)
     queue.put(osds_this_node)
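Note: the added `ceph_conn.shutdown()` releases the librados handle opened at the top of the stats thread. As a general pattern, and not the daemon's actual code, a `try`/`finally` makes that cleanup unconditional even if collection raises; the config paths below are illustrative:

```python
import json
from rados import Rados

# Illustrative paths; the real values come from the daemon's config as shown above.
ceph_conn = Rados(conffile='/etc/ceph/ceph.conf', conf=dict(keyring='/etc/ceph/ceph.client.admin.keyring'))
ceph_conn.connect(timeout=1)
try:
    command = {"prefix": "df", "format": "json"}
    pools = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])['pools']
finally:
    ceph_conn.shutdown()   # always release the cluster handle
```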
@@ -1758,8 +1779,9 @@ def node_keepalive():
     # Get past state and update if needed
     if debug:
         logger.out("Get past state and update if needed", state='d', prefix='main-thread')
 
     past_state = zkhandler.read(('node.state.daemon', this_node.name))
-    if past_state != 'run':
+    if past_state != 'run' and past_state != 'shutdown':
         this_node.daemon_state = 'run'
         zkhandler.write([
             (('node.state.daemon', this_node.name), 'run')
@@ -1858,9 +1880,10 @@ def node_keepalive():
         else:
             cst_colour = fmt_cyan
         logger.out(
-            '{}{} keepalive{} [{}{}{}]'.format(
+            '{}{} keepalive @ {}{} [{}{}{}]'.format(
                 fmt_purple,
                 myhostname,
+                datetime.now(),
                 fmt_end,
                 fmt_bold + cst_colour,
                 this_node.router_state,
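Note: the final hunk implements the "Adds a date output to keepalive messages" changelog item by inserting `datetime.now()` into the log format string. Stripped of the colour codes, the resulting line looks roughly like this; the hostname and router state are example values:

```python
from datetime import datetime

myhostname = 'hv1'          # example hostname, not from this diff
router_state = 'primary'    # example router state

print('{} keepalive @ {} [{}]'.format(myhostname, datetime.now(), router_state))
# -> hv1 keepalive @ 2021-07-11 23:19:09.123456 [primary]
```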