Compare commits
21 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 2e9f6ac201 | |
| | f09849bedf | |
| | 8c975e5c46 | |
| | c76149141f | |
| | f00c4d07f4 | |
| | 20b66c10e1 | |
| | cfeba50b17 | |
| | 0699c48d10 | |
| | 551bae2518 | |
| | 4832245d9c | |
| | 2138f2f59f | |
| | d1d355a96b | |
| | 2b5dc286ab | |
| | c0c9327a7d | |
| | 5ffabcfef5 | |
| | 330cf14638 | |
| | 9d0eb20197 | |
| | 3f5b7045a2 | |
| | 80fe96b24d | |
| | 80f04ce8ee | |
| | 65d14ccd92 | |
README.md (16 changes)
@@ -42,6 +42,22 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r
 
 ## Changelog
 
+#### v0.9.25
+
+* [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
+* [Node Daemon] Adds a date output to keepalive messages
+* [Daemons] Configures ZK connection logging only for persistent connections
+* [API Provisioner] Add context manager-based chroot to Debootstrap example script
+* [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
+
+#### v0.9.24
+
+* [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
+* [Node Daemon] Removes flaky Zookeeper connection renewals that caused problems
+* [CLI Client] Allow raw lists of clusters from `pvc cluster list`
+* [API Daemon] Fixes several issues when getting VM data without stats
+* [API Daemon] Fixes issues with removing VMs while disks are still in use (failed provisioning, etc.)
+
 #### v0.9.23
 
 * [Daemons] Fixes a critical overwriting bug in zkhandler when schema paths are not yet valid
@@ -34,6 +34,29 @@
 # with that.
 
 import os
+from contextlib import contextmanager
+
+
+# Create a chroot context manager
+# This can be used later in the script to chroot to the destination directory
+# for instance to run commands within the target.
+@contextmanager
+def chroot_target(destination):
+    try:
+        real_root = os.open("/", os.O_RDONLY)
+        os.chroot(destination)
+        fake_root = os.open("/", os.O_RDONLY)
+        os.fchdir(fake_root)
+        yield
+    finally:
+        os.fchdir(real_root)
+        os.chroot(".")
+        os.fchdir(real_root)
+        os.close(fake_root)
+        os.close(real_root)
+        del fake_root
+        del real_root
+
 
 # Installation function - performs a debootstrap install of a Debian system
 # Note that the only arguments are keyword arguments.
@@ -193,13 +216,7 @@ GRUB_DISABLE_LINUX_UUID=false
         fh.write(data)
 
-    # Chroot, do some in-root tasks, then exit the chroot
-    # EXITING THE CHROOT IS VERY IMPORTANT OR THE FOLLOWING STAGES OF THE PROVISIONER
-    # WILL FAIL IN UNEXPECTED WAYS! Keep this in mind when using chroot in your scripts.
-    real_root = os.open("/", os.O_RDONLY)
-    os.chroot(temporary_directory)
-    fake_root = os.open("/", os.O_RDONLY)
-    os.fchdir(fake_root)
-
+    with chroot_target(temporary_directory):
         # Install and update GRUB
         os.system(
             "grub-install --force /dev/rbd/{}/{}_{}".format(root_disk['pool'], vm_name, root_disk['disk_id'])
@@ -219,15 +236,6 @@ GRUB_DISABLE_LINUX_UUID=false
             "systemctl enable cloud-init.target"
         )
 
-    # Restore our original root/exit the chroot
-    # EXITING THE CHROOT IS VERY IMPORTANT OR THE FOLLOWING STAGES OF THE PROVISIONER
-    # WILL FAIL IN UNEXPECTED WAYS! Keep this in mind when using chroot in your scripts.
-    os.fchdir(real_root)
-    os.chroot(".")
-    os.fchdir(real_root)
-    os.close(fake_root)
-    os.close(real_root)
-
     # Unmount the bound devfs
     os.system(
         "umount {}/dev".format(
@@ -235,8 +243,4 @@ GRUB_DISABLE_LINUX_UUID=false
         )
     )
 
-    # Clean up file handles so paths can be unmounted
-    del fake_root
-    del real_root
-
     # Everything else is done via cloud-init user-data
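The hunks above replace the hand-rolled chroot bookkeeping in the example Debootstrap script with a reusable context manager. A minimal standalone sketch of the same pattern (not the provisioner script itself; the `/tmp/target` path and the command run inside it are illustrative only, and chroot requires root):

```python
# Minimal sketch of the context-manager-based chroot pattern; the try/finally
# guarantees the chroot is exited even if the body raises, which is what the
# "EXITING THE CHROOT IS VERY IMPORTANT" comments used to enforce by hand.
import os
from contextlib import contextmanager


@contextmanager
def chroot(destination):
    real_root = os.open("/", os.O_RDONLY)   # handle to the original root
    try:
        os.chroot(destination)
        yield
    finally:
        # Always escape the chroot, even on error, so later provisioner stages work
        os.fchdir(real_root)
        os.chroot(".")
        os.close(real_root)


if __name__ == "__main__":
    with chroot("/tmp/target"):
        os.system("grub-install --version")  # runs inside the target root
```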
@@ -29,7 +29,7 @@
 # This script will run under root privileges as the provisioner does. Be careful
 # with that.
 
-# Installation function - performs a debootstrap install of a Debian system
+# Installation function - performs no actions then returns
 # Note that the only arguments are keyword arguments.
 def install(**kwargs):
     # The provisioner has already mounted the disks on kwargs['temporary_directory'].
@@ -25,7 +25,7 @@ import yaml
 from distutils.util import strtobool as dustrtobool
 
 # Daemon version
-version = '0.9.23'
+version = '0.9.25'
 
 # API version
 API_VERSION = 1.0
@@ -251,7 +251,11 @@ def cluster_remove(name):
 # pvc cluster list
 ###############################################################################
 @click.command(name='list', short_help='List all available clusters.')
-def cluster_list():
+@click.option(
+    '-r', '--raw', 'raw', is_flag=True, default=False,
+    help='Display the raw list of cluster names only.'
+)
+def cluster_list(raw):
     """
     List all the available PVC clusters configured in this CLI instance.
     """
@@ -302,6 +306,7 @@ def cluster_list():
         if _api_key_length > api_key_length:
             api_key_length = _api_key_length
 
+    if not raw:
         # Display the data nicely
         click.echo("Available clusters:")
         click.echo()
@@ -341,6 +346,7 @@ def cluster_list():
         if not api_key:
             api_key = 'N/A'
 
+        if not raw:
             click.echo(
                 '{bold}{name: <{name_length}} {description: <{description_length}} {address: <{address_length}} {port: <{port_length}} {scheme: <{scheme_length}} {api_key: <{api_key_length}}{end_bold}'.format(
                     bold='',
@@ -359,6 +365,8 @@ def cluster_list():
                     api_key_length=api_key_length
                 )
             )
+        else:
+            click.echo(cluster)
 
 
 # Validate that the cluster is set for a given command
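The `cluster_list` hunks above add an `is_flag` option so `pvc cluster list --raw` prints bare cluster names instead of the formatted table. A minimal, self-contained Click sketch of the same pattern (the command, data, and field widths are made up for illustration and are not the real PVC CLI code):

```python
# Standalone sketch of a --raw flag on a Click "list" command.
import click

# Hypothetical cluster store standing in for the CLI's config file
clusters = {'cluster1': 'https://10.0.0.1:7370', 'cluster2': 'https://10.0.0.2:7370'}


@click.command(name='list', short_help='List all available clusters.')
@click.option(
    '-r', '--raw', 'raw', is_flag=True, default=False,
    help='Display the raw list of cluster names only.'
)
def cluster_list(raw):
    """
    List all the available clusters configured in this CLI instance.
    """
    if not raw:
        click.echo("Available clusters:")
        for name, address in clusters.items():
            click.echo("{: <12} {}".format(name, address))
    else:
        # Raw mode: one bare name per line, suitable for scripting
        for name in clusters:
            click.echo(name)


if __name__ == '__main__':
    cluster_list()
```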
@@ -1652,7 +1660,7 @@ def vm_dump(filename, domain):
 @cluster_req
 def vm_list(target_node, target_state, limit, raw):
     """
-    List all virtual machines; optionally only match names matching regex LIMIT.
+    List all virtual machines; optionally only match names or full UUIDs matching regex LIMIT.
 
     NOTE: Red-coloured network lists indicate one or more configured networks are missing/invalid.
     """
@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name='pvc',
-    version='0.9.23',
+    version='0.9.25',
     packages=['pvc', 'pvc.cli_lib'],
     install_requires=[
         'Click',
@@ -343,8 +343,13 @@ def getInformationFromXML(zkhandler, uuid):
 
     parsed_xml = getDomainXML(zkhandler, uuid)
 
-    stats_data = loads(zkhandler.read(('domain.stats', uuid)))
-    if stats_data is None:
+    stats_data = zkhandler.read(('domain.stats', uuid))
+    if stats_data is not None:
+        try:
+            stats_data = loads(stats_data)
+        except Exception:
+            stats_data = {}
+    else:
         stats_data = {}
 
     domain_uuid, domain_name, domain_description, domain_memory, domain_vcpu, domain_vcputopo = getDomainMainDetails(parsed_xml)
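The hunk above stops calling `loads()` directly on a Zookeeper read that may be `None` for a VM without stats. A small sketch of the same guard, where `read_key()` is a hypothetical stand-in for `zkhandler.read()`:

```python
# Sketch of the defensive pattern: never JSON-decode a missing or malformed value.
from json import loads


def read_key(key):
    # Hypothetical reader; returns None when the key does not exist yet
    return None


def get_stats(uuid):
    stats_data = read_key(('domain.stats', uuid))
    if stats_data is not None:
        try:
            stats_data = loads(stats_data)
        except Exception:
            stats_data = {}   # present but unparseable
    else:
        stats_data = {}       # not written yet (e.g. VM has no stats)
    return stats_data


print(get_stats('1234'))  # -> {}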
@@ -449,14 +449,6 @@ def remove_vm(zkhandler, domain):
     if current_vm_state != 'stop':
         change_state(zkhandler, dom_uuid, 'stop')
 
-    # Gracefully terminate the class instances
-    change_state(zkhandler, dom_uuid, 'delete')
-
-    # Delete the configurations
-    zkhandler.delete([
-        ('domain', dom_uuid)
-    ])
-
-    # Wait for 1 second to allow state to flow to all nodes
-    time.sleep(1)
-
@@ -465,11 +457,28 @@ def remove_vm(zkhandler, domain):
         # vmpool/vmname_volume
         try:
             disk_pool, disk_name = disk.split('/')
-            retcode, message = ceph.remove_volume(zkhandler, disk_pool, disk_name)
         except ValueError:
             continue
 
-    return True, 'Removed VM "{}" and disks from the cluster.'.format(domain)
+        retcode, message = ceph.remove_volume(zkhandler, disk_pool, disk_name)
+        if not retcode:
+            if re.match('^ERROR: No volume with name', message):
+                continue
+            else:
+                return False, message
+
+    # Gracefully terminate the class instances
+    change_state(zkhandler, dom_uuid, 'delete')
+
+    # Wait for 1/2 second to allow state to flow to all nodes
+    time.sleep(0.5)
+
+    # Delete the VM configuration from Zookeeper
+    zkhandler.delete([
+        ('domain', dom_uuid)
+    ])
+
+    return True, 'Removed VM "{}" and its disks from the cluster.'.format(domain)
 
 
 def start_vm(zkhandler, domain):
@@ -789,7 +798,10 @@ def get_console_log(zkhandler, domain, lines=1000):
         return False, 'ERROR: Could not find VM "{}" in the cluster!'.format(domain)
 
     # Get the data from ZK
-    console_log = zkhandler.read(('domain.log.console', dom_uuid))
+    console_log = zkhandler.read(('domain.console.log', dom_uuid))
+
+    if console_log is None:
+        return True, ''
 
     # Shrink the log buffer to length lines
     shrunk_log = console_log.split('\n')[-lines:]
@@ -897,6 +909,9 @@ def get_list(zkhandler, node, state, limit, is_fuzzy=True):
         for vm_uuid in vm_execute_list:
             futures.append(executor.submit(common.getInformationFromXML, zkhandler, vm_uuid))
         for future in futures:
+            try:
                 vm_data_list.append(future.result())
+            except Exception:
+                pass
 
     return True, vm_data_list
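The final `get_list` hunk wraps each `future.result()` in a try/except so one failed VM lookup no longer aborts the whole listing. A self-contained sketch of that pattern, where `fetch_info()` is a dummy stand-in for `common.getInformationFromXML()`:

```python
# Sketch of collecting thread-pool results while tolerating individual failures.
from concurrent.futures import ThreadPoolExecutor


def fetch_info(vm_uuid):
    if vm_uuid == 'bad':
        raise ValueError('stats missing')   # simulate one failing lookup
    return {'uuid': vm_uuid}


vm_data_list = []
with ThreadPoolExecutor(max_workers=32, thread_name_prefix='vm_list') as executor:
    futures = [executor.submit(fetch_info, u) for u in ['a', 'bad', 'c']]
    for future in futures:
        try:
            vm_data_list.append(future.result())
        except Exception:
            pass   # skip entries that could not be gathered

print(vm_data_list)   # [{'uuid': 'a'}, {'uuid': 'c'}]
```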
@@ -124,37 +124,29 @@ class ZKHandler(object):
     # State/connection management
     #
     def listener(self, state):
+        """
+        Listen for KazooState changes and log accordingly.
+
+        This function does not do anything except for log the state, and Kazoo handles the rest.
+        """
         if state == KazooState.CONNECTED:
-            self.log('Connection to Zookeeper started', state='o')
+            self.log('Connection to Zookeeper resumed', state='o')
         else:
-            self.log('Connection to Zookeeper lost', state='w')
-
-            while True:
-                time.sleep(0.5)
-
-                _zk_conn = KazooClient(hosts=self.coordinators)
-                try:
-                    _zk_conn.start()
-                except Exception:
-                    del _zk_conn
-                    continue
-
-                self.zk_conn = _zk_conn
-                self.zk_conn.add_listener(self.listener)
-                break
+            self.log('Connection to Zookeeper lost with state {}'.format(state), state='w')
 
     def connect(self, persistent=False):
         """
-        Start the zk_conn object and connect to the cluster, then load the current schema version
+        Start the zk_conn object and connect to the cluster
         """
         try:
             self.zk_conn.start()
+            if persistent:
+                self.log('Connection to Zookeeper started', state='o')
+                self.zk_conn.add_listener(self.listener)
         except Exception as e:
             raise ZKConnectionException(self, e)
 
-    def disconnect(self):
+    def disconnect(self, persistent=False):
         """
         Stop and close the zk_conn object and disconnect from the cluster
 
@@ -162,11 +154,27 @@ class ZKHandler(object):
         """
         self.zk_conn.stop()
         self.zk_conn.close()
+        if persistent:
+            self.log('Connection to Zookeeper terminated', state='o')
 
     #
     # Schema helper actions
     #
     def get_schema_path(self, key):
+        """
+        Get the Zookeeper path for {key} from the current schema based on its format.
+
+        If {key} is a tuple of length 2, it's treated as a path plus an item instance of that path (e.g. a node, a VM, etc.).
+
+        If {key} is a tuple of length 4, it is treated as a path plus an item instance, as well as another item instance of the subpath.
+
+        If {key} is just a string, it's treated as a lone path (mostly used for the 'base' schema group.
+
+        Otherwise, returns None since this is not a valid key.
+
+        This function also handles the special case where a string that looks like an existing path (i.e. starts with '/') is passed;
+        in that case it will silently return the same path back. This was mostly a migration functionality and is deprecated.
+        """
         if isinstance(key, tuple):
            # This is a key tuple with both an ipath and an item
            if len(key) == 2:
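The zkhandler hunks above drop the flaky in-listener reconnection loop and make connection logging (and listener registration) apply only to persistent connections. A minimal Kazoo sketch of that split, outside of PVC; the coordinator address is a placeholder:

```python
# Sketch of persistent vs. one-shot Zookeeper connection handling with Kazoo.
from kazoo.client import KazooClient
from kazoo.protocol.states import KazooState


def listener(state):
    # Only log; Kazoo itself handles reconnection of a persistent session
    if state == KazooState.CONNECTED:
        print('Connection to Zookeeper resumed')
    else:
        print('Connection to Zookeeper lost with state {}'.format(state))


def connect(zk_conn, persistent=False):
    zk_conn.start()
    if persistent:
        print('Connection to Zookeeper started')
        zk_conn.add_listener(listener)   # state changes only matter for long-lived sessions


def disconnect(zk_conn, persistent=False):
    zk_conn.stop()
    zk_conn.close()
    if persistent:
        print('Connection to Zookeeper terminated')


zk = KazooClient(hosts='127.0.0.1:2181')
connect(zk, persistent=True)
disconnect(zk, persistent=True)
```

One-shot CLI/API calls stay quiet, while the daemons' long-lived sessions get both the log lines and the state listener.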
debian/changelog (vendored, 20 changes)
@@ -1,3 +1,23 @@
+pvc (0.9.25-0) unstable; urgency=high
+
+  * [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
+  * [Node Daemon] Adds a date output to keepalive messages
+  * [Daemons] Configures ZK connection logging only for persistent connections
+  * [API Provisioner] Add context manager-based chroot to Debootstrap example script
+  * [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
+
+ -- Joshua M. Boniface <joshua@boniface.me>  Sun, 11 Jul 2021 23:19:09 -0400
+
+pvc (0.9.24-0) unstable; urgency=high
+
+  * [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
+  * [Node Daemon] Removes flaky Zookeeper connection renewals that caused problems
+  * [CLI Client] Allow raw lists of clusters from `pvc cluster list`
+  * [API Daemon] Fixes several issues when getting VM data without stats
+  * [API Daemon] Fixes issues with removing VMs while disks are still in use (failed provisioning, etc.)
+
+ -- Joshua M. Boniface <joshua@boniface.me>  Fri, 09 Jul 2021 15:58:36 -0400
+
 pvc (0.9.23-0) unstable; urgency=high
 
   * [Daemons] Fixes a critical overwriting bug in zkhandler when schema paths are not yet valid
@@ -42,6 +42,22 @@ To get started with PVC, please see the [About](https://parallelvirtualcluster.r
 
 ## Changelog
 
+#### v0.9.25
+
+* [Node Daemon] Returns to Rados library calls for Ceph due to performance problems
+* [Node Daemon] Adds a date output to keepalive messages
+* [Daemons] Configures ZK connection logging only for persistent connections
+* [API Provisioner] Add context manager-based chroot to Debootstrap example script
+* [Node Daemon] Fixes a bug where shutdown daemon state was overwritten
+
+#### v0.9.24
+
+* [Node Daemon] Removes Rados module polling of Ceph cluster and returns to command-based polling for timeout purposes, and removes some flaky return statements
+* [Node Daemon] Removes flaky Zookeeper connection renewals that caused problems
+* [CLI Client] Allow raw lists of clusters from `pvc cluster list`
+* [API Daemon] Fixes several issues when getting VM data without stats
+* [API Daemon] Fixes issues with removing VMs while disks are still in use (failed provisioning, etc.)
+
 #### v0.9.23
 
 * [Daemons] Fixes a critical overwriting bug in zkhandler when schema paths are not yet valid
@@ -32,6 +32,7 @@ import yaml
 import json
 
 from socket import gethostname
+from datetime import datetime
 from threading import Thread
 from ipaddress import ip_address, ip_network
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -55,7 +56,7 @@ import pvcnoded.CephInstance as CephInstance
 import pvcnoded.MetadataAPIInstance as MetadataAPIInstance
 
 # Version string for startup output
-version = '0.9.23'
+version = '0.9.25'
 
 ###############################################################################
 # PVCD - node daemon startup program
@@ -658,7 +659,7 @@ def update_schema(new_schema_version, stat, event=''):
     # Restart ourselves with the new schema
     logger.out('Reloading node daemon', state='s')
     try:
-        zkhandler.disconnect()
+        zkhandler.disconnect(persistent=True)
         del zkhandler
     except Exception:
         pass
@@ -751,7 +752,7 @@ def cleanup():
 
     # Close the Zookeeper connection
     try:
-        zkhandler.disconnect()
+        zkhandler.disconnect(persistent=True)
         del zkhandler
     except Exception:
         pass
@@ -1334,11 +1335,13 @@ def collect_ceph_stats(queue):
         ceph_health = health_status['status']
     except Exception as e:
         logger.out('Failed to obtain Ceph health data: {}'.format(e), state='e')
-        return
+        ceph_health = 'HEALTH_UNKN'
 
-    if ceph_health == 'HEALTH_OK':
+    if ceph_health in ['HEALTH_OK']:
         ceph_health_colour = fmt_green
-    elif ceph_health == 'HEALTH_WARN':
+    elif ceph_health in ['HEALTH_UNKN']:
+        ceph_health_colour = fmt_cyan
+    elif ceph_health in ['HEALTH_WARN']:
         ceph_health_colour = fmt_yellow
     else:
         ceph_health_colour = fmt_red
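Instead of bailing out of the keepalive when health cannot be read, the hunk above falls back to a `HEALTH_UNKN` sentinel and gives it its own colour. A tiny sketch of that mapping, with placeholder ANSI codes standing in for the daemon's format strings:

```python
# Sketch of the health-to-colour fallback; the ANSI codes are placeholders.
fmt_green, fmt_cyan, fmt_yellow, fmt_red = '\033[32m', '\033[36m', '\033[33m', '\033[31m'


def health_colour(ceph_health):
    if ceph_health in ['HEALTH_OK']:
        return fmt_green
    elif ceph_health in ['HEALTH_UNKN']:
        return fmt_cyan          # health could not be queried this cycle
    elif ceph_health in ['HEALTH_WARN']:
        return fmt_yellow
    else:
        return fmt_red           # HEALTH_ERR and anything unexpected


print(health_colour('HEALTH_UNKN') + 'HEALTH_UNKN' + '\033[0m')
```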
@@ -1356,7 +1359,6 @@ def collect_ceph_stats(queue):
         ])
     except Exception as e:
         logger.out('Failed to set Ceph status data: {}'.format(e), state='e')
-        return
 
     if debug:
         logger.out("Set ceph rados df information in zookeeper (primary only)", state='d', prefix='ceph-thread')
@@ -1370,15 +1372,15 @@ def collect_ceph_stats(queue):
         ])
     except Exception as e:
         logger.out('Failed to set Ceph utilization data: {}'.format(e), state='e')
-        return
 
     if debug:
         logger.out("Set pool information in zookeeper (primary only)", state='d', prefix='ceph-thread')
 
     # Get pool info
-    retcode, stdout, stderr = common.run_os_command('ceph df --format json', timeout=1)
+    command = {"prefix": "df", "format": "json"}
+    ceph_df_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
     try:
-        ceph_pool_df_raw = json.loads(stdout)['pools']
+        ceph_pool_df_raw = json.loads(ceph_df_output)['pools']
     except Exception as e:
         logger.out('Failed to obtain Pool data (ceph df): {}'.format(e), state='w')
         ceph_pool_df_raw = []
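This hunk (and the OSD dump hunk below) switches the stats collector from shelling out to the `ceph` CLI back to JSON mon commands issued through the Rados library, which the changelog cites as a performance fix. A minimal sketch of that call pattern, assuming a standard admin `ceph.conf` and keyring:

```python
# Sketch of querying 'ceph df' through the rados bindings rather than the CLI.
import json
import rados

ceph_conn = rados.Rados(conffile='/etc/ceph/ceph.conf',
                        conf=dict(keyring='/etc/ceph/ceph.client.admin.keyring'))
ceph_conn.connect()

# mon_command takes a JSON command string and returns (retcode, outbuf, outs)
command = {"prefix": "df", "format": "json"}
retcode, outbuf, outs = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)
try:
    ceph_pool_df_raw = json.loads(outbuf.decode('ascii'))['pools']
except Exception as e:
    print('Failed to obtain Pool data (ceph df): {}'.format(e))
    ceph_pool_df_raw = []

ceph_conn.shutdown()
```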
@@ -1449,9 +1451,9 @@ def collect_ceph_stats(queue):
     osd_dump = dict()
 
+    command = {"prefix": "osd dump", "format": "json"}
+    osd_dump_output = ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1].decode('ascii')
     try:
-        retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json --connect-timeout 2', timeout=2)
-        osd_dump_raw = json.loads(stdout)['osds']
+        osd_dump_raw = json.loads(osd_dump_output)['osds']
     except Exception as e:
         logger.out('Failed to obtain OSD data: {}'.format(e), state='w')
         osd_dump_raw = []
@@ -1608,7 +1610,6 @@ def collect_vm_stats(queue):
     lv_conn = libvirt.open(libvirt_name)
     if lv_conn is None:
         logger.out('Failed to open connection to "{}"'.format(libvirt_name), state='e')
-        return
 
     memalloc = 0
     memprov = 0
@@ -1778,8 +1779,9 @@ def node_keepalive():
     # Get past state and update if needed
     if debug:
         logger.out("Get past state and update if needed", state='d', prefix='main-thread')
+
     past_state = zkhandler.read(('node.state.daemon', this_node.name))
-    if past_state != 'run':
+    if past_state != 'run' and past_state != 'shutdown':
         this_node.daemon_state = 'run'
         zkhandler.write([
             (('node.state.daemon', this_node.name), 'run')
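The keepalive now forces the daemon state back to `run` only when the node is neither running nor shutting down, so a deliberate `shutdown` marker survives the next keepalive cycle. A tiny illustrative sketch, with a plain value standing in for the `node.state.daemon` key in Zookeeper:

```python
# Sketch of the keepalive guard: do not clobber a node that is deliberately shutting down.
def keepalive_update(past_state):
    if past_state != 'run' and past_state != 'shutdown':
        return 'run'      # recover from transient/unknown states only
    return past_state     # leave 'run' and 'shutdown' untouched


assert keepalive_update('init') == 'run'
assert keepalive_update('shutdown') == 'shutdown'
```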
@@ -1868,7 +1870,6 @@ def node_keepalive():
         ])
     except Exception:
         logger.out('Failed to set keepalive data', state='e')
-        return
 
     # Display node information to the terminal
     if config['log_keepalives']:
@@ -1879,9 +1880,10 @@ def node_keepalive():
         else:
             cst_colour = fmt_cyan
         logger.out(
-            '{}{} keepalive{} [{}{}{}]'.format(
+            '{}{} keepalive @ {}{} [{}{}{}]'.format(
                 fmt_purple,
                 myhostname,
+                datetime.now(),
                 fmt_end,
                 fmt_bold + cst_colour,
                 this_node.router_state,