Compare commits

..

16 Commits

SHA1 Message Date
18f09196be Bump version to 0.9.93 2024-01-30 09:51:21 -05:00
8419659e1b Ensure zkhandler is always cleaned up
Even if the subfunction of an API @ZKConnection call fails, the
zkhandler needs to terminate and clean up, or it leaves stuck threads
around.
2024-01-30 09:48:17 -05:00
df40b779af Bump version to 0.9.92 2024-01-29 09:39:10 -05:00
db4f0881a2 Improve error handling and retries
1. Use the actual response code from the server on error, or 504 on
timeouts, instead of a generic 500.
2. Retry GET requests 3 times and only error if the last attempt fails.
2024-01-29 09:35:14 -05:00
9b51fe9f10 Use get() for newer keys in client 2024-01-29 09:21:02 -05:00
a66449541d Improve script error handling and variables 2024-01-26 15:41:34 -05:00
d28fb71f57 Fix incorrect variable set 2024-01-24 14:40:40 -05:00
e5e9c7086a Add missing restore state to colours 2024-01-24 09:34:59 -05:00
f29b4c2755 Bump version to 0.9.91 2024-01-23 10:40:59 -05:00
0adec2be0d Use consistent and less error-prone find rm's 2024-01-23 10:40:48 -05:00
b994e1a26c Add cleanup of pycaches to CLI install 2024-01-23 10:22:50 -05:00
6d6420a695 Add missing value to vm_define function 2024-01-23 09:58:32 -05:00
94e0287fc4 Add missing modules to default cloud-init 2024-01-21 14:23:44 -05:00
2886176762 Improve handling of task arg display
Shows each subarg of the task_args as its own element, if applicable,
and fits the width to the terminal using MAX_CONTENT_WIDTH instead of an
arbitrary value.
2024-01-18 13:00:48 -05:00
4dc4c975f1 Add status messages during task query 2024-01-18 12:38:53 -05:00
8f3120baf3 Avoid errors if task_status is a tuple 2024-01-18 12:30:31 -05:00
25 changed files with 250 additions and 59 deletions


@@ -1 +1 @@
0.9.90
0.9.93


@@ -1,5 +1,25 @@
## PVC Changelog
###### [v0.9.93](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.93)
* [API Daemon] Fixes a bug where stuck zkhandler threads were not cleaned up on error
###### [v0.9.92](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.92)
* [CLI Client] Adds the new restore state to the colours list for VM status
* [API Daemon] Fixes an incorrect variable assignment
* [Provisioner] Improves the error handling of various steps in the debootstrap and rinse example scripts
* [CLI Client] Fixes two bugs around missing keys that were added recently (uses get() instead of direct dictionary refs)
* [CLI Client] Improves API error handling via GET retries (x3) and better server status code handling
###### [v0.9.91](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.91)
* [Client CLI] Fixes a bug and improves output during cluster task events.
* [Client CLI] Improves the output of the task list display.
* [Provisioner] Fixes some missing cloud-init modules in the default debootstrap script.
* [Client CLI] Fixes a bug with a missing argument to the vm_define helper function.
* [All] Fixes inconsistent package find + rm commands to avoid errors in dpkg.
###### [v0.9.90](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.90)
* [Client CLI/API Daemon] Adds additional backup metainfo and an emailed report option to autobackups.


@@ -150,6 +150,10 @@
from daemon_lib.vmbuilder import VMBuilder
# These are some global variables used below
default_root_password = "test123"
# The VMBuilderScript class must be named as such, and extend VMBuilder.
class VMBuilderScript(VMBuilder):
def setup(self):
@@ -498,11 +502,15 @@ class VMBuilderScript(VMBuilder):
ret = os.system(
f"debootstrap --include={','.join(deb_packages)} {deb_release} {temp_dir} {deb_mirror}"
)
ret = int(ret >> 8)
if ret > 0:
self.fail("Failed to run debootstrap")
self.fail(f"Debootstrap failed with exit code {ret}")
# Bind mount the devfs so we can grub-install later
os.system("mount --bind /dev {}/dev".format(temp_dir))
ret = os.system("mount --bind /dev {}/dev".format(temp_dir))
ret = int(ret >> 8)
if ret > 0:
self.fail(f"/dev bind mount failed with exit code {ret}")
# Create an fstab entry for each volume
fstab_file = "{}/etc/fstab".format(temp_dir)
@@ -589,11 +597,13 @@ After=multi-user.target
- migrator
- bootcmd
- write-files
- growpart
- resizefs
- set_hostname
- update_hostname
- update_etc_hosts
- ca-certs
- users-groups
- ssh
cloud_config_modules:
@@ -686,23 +696,36 @@ GRUB_DISABLE_LINUX_UUID=false
# Do some tasks inside the chroot using the provided context manager
with chroot(temp_dir):
# Install and update GRUB
os.system(
ret = os.system(
"grub-install --force /dev/rbd/{}/{}_{}".format(
root_volume["pool"], vm_name, root_volume["disk_id"]
)
)
os.system("update-grub")
ret = int(ret >> 8)
if ret > 0:
self.fail(f"GRUB install failed with exit code {ret}")
ret = os.system("update-grub")
ret = int(ret >> 8)
if ret > 0:
self.fail(f"GRUB update failed with exit code {ret}")
# Set a really dumb root password so the VM can be debugged
# EITHER CHANGE THIS YOURSELF, here or in Userdata, or run something after install
# to change the root password: don't leave it like this on an Internet-facing machine!
os.system("echo root:test123 | chpasswd")
ret = os.system(f"echo root:{default_root_password} | chpasswd")
ret = int(ret >> 8)
if ret > 0:
self.fail(f"Root password change failed with exit code {ret}")
# Enable cloud-init target on (first) boot
# Your user-data should handle this and disable it once done, or things get messy.
# That cloud-init won't run without this hack seems like a bug... but even the official
# Debian cloud images are affected, so who knows.
os.system("systemctl enable cloud-init.target")
ret = os.system("systemctl enable cloud-init.target")
ret = int(ret >> 8)
if ret > 0:
self.fail(f"Enable of cloud-init failed with exit code {ret}")
def cleanup(self):
"""
@@ -727,7 +750,7 @@ GRUB_DISABLE_LINUX_UUID=false
temp_dir = "/tmp/target"
# Unmount the bound devfs
os.system("umount {}/dev".format(temp_dir))
os.system("umount -f {}/dev".format(temp_dir))
# Use this construct for reversing the list, as the normal reverse() messes with the list
for volume in list(reversed(self.vm_data["volumes"])):
@@ -744,7 +767,7 @@ GRUB_DISABLE_LINUX_UUID=false
):
# Unmount filesystem
retcode, stdout, stderr = pvc_common.run_os_command(
f"umount {mount_path}"
f"umount -f {mount_path}"
)
if retcode:
self.log_err(

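Both example scripts repeat the same idiom after each shell call: os.system() returns a raw wait status in which the child's exit code occupies the high byte, which is why the diffs decode it with ret = int(ret >> 8) before testing it. A minimal sketch of a helper that would centralize the pattern (the helper is hypothetical, not part of these commits; "script" stands for the VMBuilderScript instance with its fail() method):

import os


def run_or_fail(script, command, description):
    # os.system() returns a wait status; os.WEXITSTATUS() extracts the
    # exit code, equivalent to the "ret = int(ret >> 8)" idiom above.
    status = os.system(command)
    code = os.WEXITSTATUS(status)
    if code > 0:
        script.fail(f"{description} failed with exit code {code}")

With such a helper each call site collapses to one line, e.g. run_or_fail(self, "update-grub", "GRUB update").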

@@ -150,6 +150,11 @@
from daemon_lib.vmbuilder import VMBuilder
# These are some global variables used below
default_root_password = "test123"
default_local_time = "UTC"
# The VMBuilderScript class must be named as such, and extend VMBuilder.
class VMBuilderScript(VMBuilder):
def setup(self):
@@ -524,13 +529,23 @@ class VMBuilderScript(VMBuilder):
ret = os.system(
f"rinse --arch {rinse_architecture} --directory {temporary_directory} --distribution {rinse_release} --cache-dir {rinse_cache} --add-pkg-list /tmp/addpkg --verbose {mirror_arg}"
)
ret = int(ret >> 8)
if ret > 0:
self.fail("Failed to run rinse")
self.fail(f"Rinse failed with exit code {ret}")
# Bind mount the devfs, sysfs, and procfs so we can grub-install later
os.system("mount --bind /dev {}/dev".format(temporary_directory))
os.system("mount --bind /sys {}/sys".format(temporary_directory))
os.system("mount --bind /proc {}/proc".format(temporary_directory))
ret = os.system("mount --bind /dev {}/dev".format(temporary_directory))
ret = int(ret >> 8)
if ret > 0:
self.fail(f"/dev bind mount failed with exit code {ret}")
ret = os.system("mount --bind /sys {}/sys".format(temporary_directory))
ret = int(ret >> 8)
if ret > 0:
self.fail(f"/sys bind mount failed with exit code {ret}")
ret = os.system("mount --bind /proc {}/proc".format(temporary_directory))
ret = int(ret >> 8)
if ret > 0:
self.fail(f"/proc bind mount failed with exit code {ret}")
# Create an fstab entry for each volume
fstab_file = "{}/etc/fstab".format(temporary_directory)
@@ -642,41 +657,76 @@ GRUB_SERIAL_COMMAND="serial --speed=115200 --unit=0 --word=8 --parity=no --stop=
# Do some tasks inside the chroot using the provided context manager
with chroot(temporary_directory):
# Fix the broken kernel from rinse by setting a systemd machine ID and running the post scripts
os.system("systemd-machine-id-setup")
os.system(
ret = os.system("systemd-machine-id-setup")
ret = int(ret >> 8)
if ret > 0:
self.fail(f"Machine ID setup failed with exit code {ret}")
ret = os.system(
"rpm -q --scripts kernel-core | grep -A20 'posttrans scriptlet' | tail -n+2 | bash -x"
)
ret = int(ret >> 8)
if ret > 0:
self.fail(f"RPM kernel reinstall failed with exit code {ret}")
# Install any post packages
os.system(f"dnf install -y {' '.join(post_packages)}")
if len(post_packages) > 0:
ret = os.system(f"dnf install -y {' '.join(post_packages)}")
ret = int(ret >> 8)
if ret > 0:
self.fail(f"DNF install failed with exit code {ret}")
# Install and update GRUB config
os.system(
ret = os.system(
"grub2-install --force /dev/rbd/{}/{}_{}".format(
root_volume["pool"], vm_name, root_volume["disk_id"]
)
)
ret = int(ret >> 8)
if ret > 0:
self.fail(f"GRUB install failed with exit code {ret}")
os.system("grub2-mkconfig -o /boot/grub2/grub.cfg")
ret = int(ret >> 8)
if ret > 0:
self.fail(f"GRUB update failed with exit code {ret}")
# Set a really dumb root password so the VM can be debugged
# EITHER CHANGE THIS YOURSELF, here or in Userdata, or run something after install
# to change the root password: don't leave it like this on an Internet-facing machine!
os.system("echo root:test123 | chpasswd")
ret = os.system(f"echo root:{default_root_password} | chpasswd")
ret = int(ret >> 8)
if ret > 0:
self.fail(f"Root password change failed with exit code {ret}")
# Enable dbus-broker
os.system("systemctl enable dbus-broker.service")
ret = os.system("systemctl enable dbus-broker.service")
ret = int(ret >> 8)
if ret > 0:
self.fail(f"Enable of dbus-broker failed with exit code {ret}")
# Enable NetworkManager
os.system("systemctl enable NetworkManager.service")
ret = int(ret >> 8)
if ret > 0:
self.fail(f"Enable of NetworkManager failed with exit code {ret}")
# Enable cloud-init target on (first) boot
# Your user-data should handle this and disable it once done, or things get messy.
# That cloud-init won't run without this hack seems like a bug... but even the official
# Debian cloud images are affected, so who knows.
os.system("systemctl enable cloud-init.target")
ret = int(ret >> 8)
if ret > 0:
self.fail(f"Enable of cloud-init failed with exit code {ret}")
# Set the timezone to UTC
os.system("ln -sf ../usr/share/zoneinfo/UTC /etc/localtime")
ret = os.system(
f"ln -sf ../usr/share/zoneinfo/{default_local_time} /etc/localtime"
)
ret = int(ret >> 8)
if ret > 0:
self.fail(f"Localtime update failed with exit code {ret}")
def cleanup(self):
"""


@@ -27,7 +27,7 @@ from distutils.util import strtobool as dustrtobool
import daemon_lib.config as cfg
# Daemon version
version = "0.9.90"
version = "0.9.93"
# API version
API_VERSION = 1.0


@@ -687,7 +687,10 @@ def cli_cluster_task(task_id, wait_flag, format_function):
if wait_flag:
# First validate that this is actually a valid task that is running
echo(CLI_CONFIG, "Querying cluster for tasks...", newline=False)
retcode, retdata = pvc.lib.common.task_status(CLI_CONFIG, None)
echo(CLI_CONFIG, " done.")
echo(CLI_CONFIG, "")
if task_id in [i["id"] for i in retdata]:
task = [i for i in retdata if i["id"] == task_id][0]
retmsg = wait_for_celery_task(
@@ -699,7 +702,10 @@ def cli_cluster_task(task_id, wait_flag, format_function):
retmsg = f"No task with ID {task_id} found."
finish(retcode, retmsg)
else:
echo(CLI_CONFIG, "Querying cluster for tasks...", newline=False)
retcode, retdata = pvc.lib.common.task_status(CLI_CONFIG, task_id)
echo(CLI_CONFIG, " done.")
echo(CLI_CONFIG, "")
finish(retcode, retdata, format_function)


@@ -645,6 +645,24 @@ def cli_cluster_task_format_pretty(CLI_CONFIG, task_data):
if _task_type_length > task_type_length:
task_type_length = _task_type_length
for arg_name, arg_data in task["kwargs"].items():
# Skip the "run_on" argument
if arg_name == "run_on":
continue
# task_arg_name column
_task_arg_name_length = len(str(arg_name)) + 1
if _task_arg_name_length > task_arg_name_length:
task_arg_name_length = _task_arg_name_length
task_header_length = (
task_id_length + task_name_length + task_type_length + task_worker_length + 3
)
max_task_data_length = (
MAX_CONTENT_WIDTH - task_header_length - task_arg_name_length - 2
)
for task in task_data:
updated_kwargs = list()
for arg_name, arg_data in task["kwargs"].items():
# Skip the "run_on" argument
@@ -656,15 +674,30 @@ def cli_cluster_task_format_pretty(CLI_CONFIG, task_data):
if _task_arg_name_length > task_arg_name_length:
task_arg_name_length = _task_arg_name_length
if len(str(arg_data)) > 17:
arg_data = arg_data[:17] + "..."
if isinstance(arg_data, list):
for subarg_data in arg_data:
if len(subarg_data) > max_task_data_length:
subarg_data = (
str(subarg_data[: max_task_data_length - 4]) + " ..."
)
# task_arg_data column
_task_arg_data_length = len(str(arg_data)) + 1
if _task_arg_data_length > task_arg_data_length:
task_arg_data_length = _task_arg_data_length
# task_arg_data column
_task_arg_data_length = len(str(subarg_data)) + 1
if _task_arg_data_length > task_arg_data_length:
task_arg_data_length = _task_arg_data_length
updated_kwargs.append({"name": arg_name, "data": subarg_data})
else:
if len(str(arg_data)) > 24:
arg_data = str(arg_data[:24]) + " ..."
# task_arg_data column
_task_arg_data_length = len(str(arg_data)) + 1
if _task_arg_data_length > task_arg_data_length:
task_arg_data_length = _task_arg_data_length
updated_kwargs.append({"name": arg_name, "data": arg_data})
updated_kwargs.append({"name": arg_name, "data": arg_data})
task["kwargs"] = updated_kwargs
tasks.append(task)

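The reworked formatter above fits argument data to the terminal by deriving the space left for data from MAX_CONTENT_WIDTH minus the fixed columns, then truncating long values with a trailing " ...". A condensed, standalone sketch of that truncate-to-width arithmetic (illustrative only; the real function also tracks per-column maxima as shown in the diff):

MAX_CONTENT_WIDTH = 120  # illustrative; the CLI derives this from the terminal


def fit_arg_data(value, task_header_length, task_arg_name_length):
    # Space remaining for data once the ID/name/type/worker columns and
    # the argument-name column are accounted for, as in the diff.
    max_task_data_length = (
        MAX_CONTENT_WIDTH - task_header_length - task_arg_name_length - 2
    )
    value = str(value)
    if len(value) > max_task_data_length:
        value = value[: max_task_data_length - 4] + " ..."
    return value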

@@ -115,6 +115,8 @@ def wait_for_celery_task(CLI_CONFIG, task_detail, start_late=False):
)
while True:
sleep(0.5)
if isinstance(task_status, tuple):
continue
if task_status.get("state") != "RUNNING":
break
if task_status.get("current") > last_task:

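The wait_for_celery_task change above guards the polling loop against task_status arriving as a tuple (the client's error-return shape) rather than a status dict, so the .get() calls below the guard cannot raise. The guard in isolation, as a minimal sketch (fetch_status is a hypothetical stand-in for the status query):

from time import sleep


def poll_until_done(fetch_status):
    # fetch_status() is assumed to return either a status dict or an
    # error tuple; tuples are skipped and polling simply continues.
    while True:
        sleep(0.5)
        task_status = fetch_status()
        if isinstance(task_status, tuple):
            continue
        if task_status.get("state") != "RUNNING":
            break
    return task_status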

@@ -140,15 +140,31 @@ def call_api(
# Determine the request type and hit the API
disable_warnings()
try:
response = None
if operation == "get":
response = requests.get(
uri,
timeout=timeout,
headers=headers,
params=params,
data=data,
verify=config["verify_ssl"],
)
retry_on_code = [429, 500, 502, 503, 504]
for i in range(3):
failed = False
try:
response = requests.get(
uri,
timeout=timeout,
headers=headers,
params=params,
data=data,
verify=config["verify_ssl"],
)
if response.status_code in retry_on_code:
failed = True
continue
except requests.exceptions.ConnectionError:
failed = True
pass
if failed:
error = f"Code {response.status_code}" if response else "Timeout"
raise requests.exceptions.ConnectionError(
f"Failed to connect after 3 tries ({error})"
)
if operation == "post":
response = requests.post(
uri,
@@ -189,7 +205,8 @@
)
except Exception as e:
message = "Failed to connect to the API: {}".format(e)
response = ErrorResponse({"message": message}, 500)
code = response.status_code if response else 504
response = ErrorResponse({"message": message}, code)
# Display debug output
if config["debug"]:

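The new GET path above retries up to three times, treating both transport failures and the retryable status codes (429, 500, 502, 503, 504) as soft errors, and only raises once the final attempt has failed. The same pattern condensed into a standalone sketch (assumes the requests library; get_with_retries is illustrative, not the client's actual API):

import requests

RETRY_ON_CODE = [429, 500, 502, 503, 504]


def get_with_retries(uri, tries=3, **kwargs):
    response = None
    for _ in range(tries):
        try:
            response = requests.get(uri, **kwargs)
            if response.status_code not in RETRY_ON_CODE:
                return response
        except requests.exceptions.ConnectionError:
            pass  # transport failure; fall through and retry
    error = f"Code {response.status_code}" if response is not None else "Timeout"
    raise requests.exceptions.ConnectionError(
        f"Failed to connect after {tries} tries ({error})"
    )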

@@ -430,7 +430,7 @@ def format_list_osd(config, osd_list):
)
continue
if osd_information["is_split"]:
if osd_information.get("is_split") is not None:
osd_information["device"] = f"{osd_information['device']} [s]"
# Deal with the size to human readable

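Like the v0.9.92 client fixes, this change swaps a direct dictionary reference for .get(): keys introduced by newer daemons (here is_split) may be absent from responses produced by an older cluster, and .get() yields None instead of raising KeyError. For example:

# Response from an older daemon that predates the "is_split" key:
osd_information = {"device": "/dev/sda"}

# osd_information["is_split"] would raise KeyError; .get() degrades
# gracefully, and the [s] flag is only applied when the key is present:
if osd_information.get("is_split") is not None:
    osd_information["device"] = f"{osd_information['device']} [s]"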

@@ -89,6 +89,7 @@ def vm_define(
node_selector,
node_autostart,
migration_method,
migration_max_downtime,
user_tags,
protected_tags,
):
@@ -96,7 +97,7 @@
Define a new VM on the cluster
API endpoint: POST /vm
API arguments: xml={xml}, node={node}, limit={node_limit}, selector={node_selector}, autostart={node_autostart}, migration_method={migration_method}, user_tags={user_tags}, protected_tags={protected_tags}
API arguments: xml={xml}, node={node}, limit={node_limit}, selector={node_selector}, autostart={node_autostart}, migration_method={migration_method}, migration_max_downtime={migration_max_downtime}, user_tags={user_tags}, protected_tags={protected_tags}
API schema: {"message":"{data}"}
"""
params = {
@@ -105,6 +106,7 @@
"selector": node_selector,
"autostart": node_autostart,
"migration_method": migration_method,
"migration_max_downtime": migration_max_downtime,
"user_tags": user_tags,
"protected_tags": protected_tags,
}
@@ -1630,6 +1632,7 @@ def format_info(config, domain_information, long_output):
"migrate": ansiprint.blue(),
"unmigrate": ansiprint.blue(),
"provision": ansiprint.blue(),
"restore": ansiprint.blue(),
}
ainformation.append(
"{}State:{} {}{}{}".format(
@@ -1714,7 +1717,7 @@
"{}Max live downtime:{} {}".format(
ansiprint.purple(),
ansiprint.end(),
f"{domain_information['migration_max_downtime']} ms",
f"{domain_information.get('migration_max_downtime')} ms",
)
)


@@ -2,7 +2,7 @@ from setuptools import setup
setup(
name="pvc",
version="0.9.90",
version="0.9.93",
packages=["pvc.cli", "pvc.lib"],
install_requires=[
"Click",


@@ -1201,7 +1201,7 @@ def get_resource_metrics(zkhandler):
try:
user_time = vm["vcpu_stats"]["user_time"] / 1000000
except Exception:
cpu_time = 0
user_time = 0
output_lines.append(
f"pvc_vm_vcpus_user_time{{vm=\"{vm['name']}\"}} {user_time}"
)


@@ -57,10 +57,11 @@ class ZKConnection(object):
schema_version = 0
zkhandler.schema.load(schema_version, quiet=True)
ret = function(zkhandler, *args, **kwargs)
zkhandler.disconnect()
del zkhandler
try:
ret = function(zkhandler, *args, **kwargs)
finally:
zkhandler.disconnect()
del zkhandler
return ret
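
For context, the @ZKConnection decorator hands a connected Zookeeper handler to the wrapped API function; before this fix, an exception inside that function skipped the disconnect and left the handler's threads stuck. A simplified sketch of the decorator's overall shape with the try/finally in place (the handler setup is paraphrased from the diff, not a verbatim copy):

from functools import wraps

from daemon_lib.zkhandler import ZKHandler  # assumed import path


def zk_connection(function):
    @wraps(function)
    def connection(*args, **kwargs):
        zkhandler = ZKHandler()
        zkhandler.connect()
        # ... schema load elided ...
        try:
            ret = function(zkhandler, *args, **kwargs)
        finally:
            # Runs on success and on exception alike, so the handler
            # always disconnects and its threads are torn down.
            zkhandler.disconnect()
            del zkhandler
        return ret

    return connection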

debian/changelog

@@ -1,3 +1,29 @@
pvc (0.9.93-0) unstable; urgency=high
* [API Daemon] Fixes a bug where stuck zkhandler threads were not cleaned up on error
-- Joshua M. Boniface <joshua@boniface.me> Tue, 30 Jan 2024 09:51:21 -0500
pvc (0.9.92-0) unstable; urgency=high
* [CLI Client] Adds the new restore state to the colours list for VM status
* [API Daemon] Fixes an incorrect variable assignment
* [Provisioner] Improves the error handling of various steps in the debootstrap and rinse example scripts
* [CLI Client] Fixes two bugs around missing keys that were added recently (uses get() instead of direct dictionary refs)
* [CLI Client] Improves API error handling via GET retries (x3) and better server status code handling
-- Joshua M. Boniface <joshua@boniface.me> Mon, 29 Jan 2024 09:39:10 -0500
pvc (0.9.91-0) unstable; urgency=high
* [Client CLI] Fixes a bug and improves output during cluster task events.
* [Client CLI] Improves the output of the task list display.
* [Provisioner] Fixes some missing cloud-init modules in the default debootstrap script.
* [Client CLI] Fixes a bug with a missing argument to the vm_define helper function.
* [All] Fixes inconsistent package find + rm commands to avoid errors in dpkg.
-- Joshua M. Boniface <joshua@boniface.me> Tue, 23 Jan 2024 10:02:19 -0500
pvc (0.9.90-0) unstable; urgency=high
* [Client CLI/API Daemon] Adds additional backup metainfo and an emailed report option to autobackups.


@@ -2,7 +2,12 @@
# Generate the bash completion configuration
if [ -d /etc/bash_completion.d ]; then
echo "Installing BASH completion configuration"
_PVC_COMPLETE=source_bash pvc > /etc/bash_completion.d/pvc
fi
# Remove any cached CPython directories or files
echo "Cleaning up CPython caches"
find /usr/lib/python3/dist-packages/pvc -type d -name "__pycache__" -exec rm -fr {} + &>/dev/null || true
exit 0


@@ -1,5 +1,5 @@
#!/bin/sh
# Remove any cached CPython directories or files
echo "Cleaning up existing CPython files"
find /usr/share/pvc/pvcapid -type d -name "__pycache__" -exec rm -rf {} \; &>/dev/null || true
echo "Cleaning up CPython caches"
find /usr/share/pvc/pvcapid -type d -name "__pycache__" -exec rm -fr {} + &>/dev/null || true
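
The switch from -exec rm -rf {} \; to -exec rm -fr {} + in these maintainer scripts is the "less error-prone find rm's" change from 0.9.91: the \; form runs rm once per match, so find can then try to descend into a directory it has just deleted and exit nonzero, while the + form batches all matched paths into a single rm invocation after traversal. A Python equivalent of the cleanup, for illustration only (the packaging scripts themselves remain shell):

import shutil
from pathlib import Path


def clean_pycache(root):
    # Collect all matches first (like find's "+"), then delete, so the
    # walk never descends into an already-removed tree; errors are
    # ignored just as the scripts do with "|| true".
    for cache_dir in list(Path(root).rglob("__pycache__")):
        shutil.rmtree(cache_dir, ignore_errors=True)


clean_pycache("/usr/share/pvc/pvcapid")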

debian/pvc-daemon-common.preinst (new file)

@@ -0,0 +1,5 @@
#!/bin/sh
# Remove any cached CPython directories or files
echo "Cleaning up CPython caches"
find /usr/share/pvc/daemon_lib -type d -name "__pycache__" -exec rm -fr {} + &>/dev/null || true


@@ -1,6 +1,6 @@
#!/bin/sh
# Remove any cached CPython directories or files
echo "Cleaning up existing CPython files"
find /usr/share/pvc/pvchealthd -type d -name "__pycache__" -exec rm -rf {} \; &>/dev/null || true
find /usr/share/pvc/plugins -type d -name "__pycache__" -exec rm -rf {} \; &>/dev/null || true
echo "Cleaning up CPython caches"
find /usr/share/pvc/pvchealthd -type d -name "__pycache__" -exec rm -fr {} + &>/dev/null || true
find /usr/share/pvc/plugins -type d -name "__pycache__" -exec rm -fr {} + &>/dev/null || true


@@ -1,5 +1,5 @@
#!/bin/sh
# Remove any cached CPython directories or files
echo "Cleaning up existing CPython files"
find /usr/share/pvc/pvcnoded -type d -name "__pycache__" -exec rm -rf {} \; &>/dev/null || true
echo "Cleaning up CPython caches"
find /usr/share/pvc/pvcnoded -type d -name "__pycache__" -exec rm -fr {} + &>/dev/null || true


@@ -1,5 +1,5 @@
#!/bin/sh
# Remove any cached CPython directories or files
echo "Cleaning up existing CPython files"
find /usr/share/pvc/pvcworkerd -type d -name "__pycache__" -exec rm -rf {} \; &>/dev/null || true
echo "Cleaning up CPython caches"
find /usr/share/pvc/pvcworkerd -type d -name "__pycache__" -exec rm -fr {} + &>/dev/null || true

debian/rules

@@ -13,7 +13,7 @@ override_dh_python3:
rm -r $(CURDIR)/client-cli/.pybuild $(CURDIR)/client-cli/pvc.egg-info
override_dh_auto_clean:
find . -name "__pycache__" -o -name ".pybuild" -exec rm -r {} \; || true
find . -name "__pycache__" -o -name ".pybuild" -exec rm -fr {} + || true
# If you need to rebuild the Sphinx documentation
# Add spinxdoc to the dh --with line


@@ -33,7 +33,7 @@ import os
import signal
# Daemon version
version = "0.9.90"
version = "0.9.93"
##########################################################


@@ -49,7 +49,7 @@ import re
import json
# Daemon version
version = "0.9.90"
version = "0.9.93"
##########################################################


@@ -44,7 +44,7 @@ from daemon_lib.vmbuilder import (
)
# Daemon version
version = "0.9.90"
version = "0.9.93"
config = cfg.get_configuration()