Compare commits
177 Commits
Commits in this comparison: 51e78480fa … 38d63d9837
.flake8 (6 lines changed)

@@ -3,10 +3,12 @@
 # * W503 (line break before binary operator): Black moves these to new lines
 # * E501 (line too long): Long lines are a fact of life in comment blocks; Black handles active instances of this
 # * E203 (whitespace before ':'): Black recommends this as disabled
-ignore = W503, E501
+# * F403 (import * used; unable to detect undefined names): We use a wildcard for helpers
+# * F405 (possibly undefined name): We use a wildcard for helpers
+ignore = W503, E501, F403, F405
 extend-ignore = E203
 # We exclude the Debian, migrations, and provisioner examples
-exclude = debian,api-daemon/migrations/versions,api-daemon/provisioner/examples
+exclude = debian,api-daemon/migrations/versions,api-daemon/provisioner/examples,node-daemon/monitoring
 # Set the max line length to 88 for Black
 max-line-length = 88
CHANGELOG.md (105 lines changed)

@@ -1,5 +1,110 @@
 ## PVC Changelog

+###### [v0.9.76](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.76)
+
+* [API, Client CLI] Corrects some missing node states for fencing in status output
+
+###### [v0.9.75](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.75)
+
+* [Node Daemon] Adds a startup message about IPMI when succeeding
+* [Node Daemon] Fixes a bug in fencing allowing non-failing VMs to migrate
+* [Node Daemon] Adds rounding to load average in load plugin for consistency
+
+###### [v0.9.74](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.74)
+
+* [Docs] Removes docs from the main repo
+* [Client CLI] Ensures that "provision" VMs are shown in the right colour
+* [Node Daemon] Separates the node monitoring subsystem into its own thread with a longer, customizable update interval
+* [Node Daemon] Adds checks for PSU input power redundancy (psur) and hardware RAID (hwrd)
+* [Node Daemon] Updates when Keepalive start messages are printed (end of run, with runtime) to align with new monitoring messages
+
+###### [v0.9.73](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.73)
+
+* [Node Daemon] Fixes a bug when creating the monitoring instance
+
+###### [v0.9.72](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.72)
+
+* [CLI] Restores old functionality for default node value
+
+###### [v0.9.71](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.71)
+
+* [API] Adds API support for Debian Bookworm
+
+###### [v0.9.70](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.70)
+
+* [Node Daemon] Fixes several compatibility issues for Debian 12 "Bookworm"
+
+###### [v0.9.69](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.69)
+
+* [Node Daemon] Ensures that system load is always 2 decimal places on Bookworm
+* [Node Daemon] Fixes bug blocking primary takeover at DNS Aggregator start if Patroni is down
+
+###### [v0.9.68](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.68)
+
+* [CLI] Fixes another bug with network info view
+
+###### [v0.9.67](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.67)
+
+* [CLI] Fixes several more bugs in the refactored CLI
+
+###### [v0.9.66](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.66)
+
+* [CLI] Fixes a missing YAML import in CLI
+
+###### [v0.9.65](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.65)
+
+* [CLI] Fixes a bug in the node list filtering command
+* [CLI] Fixes a bug/default when no connection is specified
+
+###### [v0.9.64](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.64)
+
+**Breaking Change [CLI]**: The CLI client root commands have been reorganized. The following commands have changed:
+
+* `pvc cluster` -> `pvc connection` (all subcommands)
+* `pvc task` -> `pvc cluster` (all subcommands)
+* `pvc maintenance` -> `pvc cluster maintenance`
+* `pvc status` -> `pvc cluster status`
+
+Ensure you have updated to the latest version of the PVC Ansible repository before deploying this version or using PVC Ansible oneshot playbooks for management.
+
+**Breaking Change [CLI]**: The `--restart` option for VM configuration changes now has an explicit `--no-restart` to disable restarting, or a prompt if neither is specified; `--unsafe` no longer bypasses this prompt, which was a bug. Applies to most `vm <cmd> set` commands like `vm vcpu set`, `vm memory set`, etc. All instances also feature restart confirmation afterwards, which, if `--restart` is provided, will prompt for confirmation unless `--yes` or `--unsafe` is specified.
+
+**Breaking Change [CLI]**: The `--long` option previously on some `info` commands no longer exists; use `-f long`/`--format long` instead.
+
+* [CLI] Significantly refactors the CLI client code for consistency and cleanliness
+* [CLI] Implements `-f`/`--format` options for all `list` and `info` commands in a consistent way
+* [CLI] Changes the behaviour of VM modification options with "--restart" to provide a "--no-restart"; defaults to a prompt if neither is specified and ignores the "--unsafe" global entirely
+* [API] Fixes several bugs in the 3-debootstrap.py provisioner example script
+* [Node] Fixes some bugs around VM shutdown on node flush
+* [Documentation] Adds mentions of Ganeti and Harvester
+
+###### [v0.9.63](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.63)
+
+* Mentions Ganeti in the docs
+* Increases API timeout back to 2s
+* Adds .update-* configs to dpkg plugin
+* Adds full/nearfull OSD warnings
+* Improves size value handling for volumes
+
+###### [v0.9.62](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.62)
+
+* [all] Adds an enhanced health checking, monitoring, and reporting system for nodes and clusters
+* [cli] Adds a cluster detail command
+
+###### [v0.9.61](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.61)
+
+* [provisioner] Fixes a bug in network comparison
+* [api] Fixes a bug where disabled VMs could not be renamed
+
+###### [v0.9.60](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.60)
+
+* [Provisioner] Cleans up several remaining bugs in the example scripts; they should all be valid now
+* [Provisioner] Adjusts the default libvirt schema to disable RBD caching for a 2x+ performance boost
+
+###### [v0.9.59](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.59)
+
+* [API] Flips the mem(prov) and mem(free) selectors, making mem(free) the default for "mem" and "memprov" explicit
+
 ###### [v0.9.58](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.58)

 * [API] Fixes a bug where migration selector could have case-sensitive operational faults
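The `--restart`/`--no-restart` behaviour described for v0.9.64 above can be sketched with click as follows. This is only an illustrative sketch of the documented flag semantics, not the PVC CLI's actual implementation; the command name, option wiring, and messages are hypothetical.

```python
import click


@click.command(name="vcpu-set")
@click.option("--restart/--no-restart", "restart", default=None,
              help="Restart the VM to apply the change (prompts if neither flag is given).")
@click.option("--yes", "assume_yes", is_flag=True,
              help="Answer yes to confirmation prompts.")
def vcpu_set(restart, assume_yes):
    # ... the configuration change itself would be applied here ...
    if restart is None:
        # Neither --restart nor --no-restart given: prompt, as the release notes describe
        restart = click.confirm("Restart the VM to apply this change?", default=False)
    if restart and not assume_yes:
        # --restart requested: confirm the restart afterwards unless --yes was passed
        restart = click.confirm("Confirm restart of the VM?", default=False)
    click.echo("Restarting VM" if restart else "Change staged; VM not restarted")


if __name__ == "__main__":
    vcpu_set()
```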
@@ -9,7 +9,7 @@

 ## What is PVC?

-PVC is a Linux KVM-based hyperconverged infrastructure (HCI) virtualization cluster solution that is fully Free Software, scalable, redundant, self-healing, self-managing, and designed for administrator simplicity. It is an alternative to other HCI solutions such as Harvester, Nutanix, and VMWare, as well as to other common virtualization stacks such as ProxMox and OpenStack.
+PVC is a Linux KVM-based hyperconverged infrastructure (HCI) virtualization cluster solution that is fully Free Software, scalable, redundant, self-healing, self-managing, and designed for administrator simplicity. It is an alternative to other HCI solutions such as Ganeti, Harvester, Nutanix, and VMWare, as well as to other common virtualization stacks such as ProxMox and OpenStack.

 PVC is a complete HCI solution, built from well-known and well-trusted Free Software tools, to assist an administrator in creating and managing a cluster of servers to run virtual machines, as well as self-managing several important aspects including storage failover, node failure and recovery, virtual machine failure and recovery, and network plumbing. It is designed to act consistently, reliably, and unobtrusively, letting the administrator concentrate on more important things.
@ -398,7 +398,7 @@ class VMBuilderScript(VMBuilder):
|
||||
if volume.get("source_volume") is not None:
|
||||
continue
|
||||
|
||||
if volume.get("filesystem") is None:
|
||||
if volume.get("filesystem") is None or volume.get("filesystem") == "swap":
|
||||
continue
|
||||
|
||||
mapped_dst_volume = f"/dev/rbd/{dst_volume}"
|
||||
@ -441,7 +441,7 @@ class VMBuilderScript(VMBuilder):
|
||||
|
||||
# The directory we mounted things on earlier during prepare(); this could very well
|
||||
# be exposed as a module-level variable if you so choose
|
||||
temporary_directory = "/tmp/target"
|
||||
temp_dir = "/tmp/target"
|
||||
|
||||
# Use these convenient aliases for later (avoiding lots of "self.vm_data" everywhere)
|
||||
vm_name = self.vm_name
|
||||
@ -469,11 +469,13 @@ class VMBuilderScript(VMBuilder):
|
||||
"grub-pc",
|
||||
"cloud-init",
|
||||
"python3-cffi-backend",
|
||||
"acpid",
|
||||
"acpi-support-base",
|
||||
"wget",
|
||||
]
|
||||
|
||||
# We need to know our root disk for later GRUB-ing
|
||||
root_disk = None
|
||||
root_volume = None
|
||||
for volume in volumes:
|
||||
if volume["mountpoint"] == "/":
|
||||
root_volume = volume
|
||||
@ -482,17 +484,17 @@ class VMBuilderScript(VMBuilder):
|
||||
|
||||
# Perform a debootstrap installation
|
||||
print(
|
||||
f"Installing system with debootstrap: debootstrap --include={','.join(deb_packages)} {deb_release} {temporary_directory} {deb_mirror}"
|
||||
f"Installing system with debootstrap: debootstrap --include={','.join(deb_packages)} {deb_release} {temp_dir} {deb_mirror}"
|
||||
)
|
||||
os.system(
|
||||
f"debootstrap --include={','.join(deb_packages)} {deb_release} {temporary_directory} {deb_mirror}"
|
||||
f"debootstrap --include={','.join(deb_packages)} {deb_release} {temp_dir} {deb_mirror}"
|
||||
)
|
||||
|
||||
# Bind mount the devfs so we can grub-install later
|
||||
os.system("mount --bind /dev {}/dev".format(temporary_directory))
|
||||
os.system("mount --bind /dev {}/dev".format(temp_dir))
|
||||
|
||||
# Create an fstab entry for each volume
|
||||
fstab_file = "{}/etc/fstab".format(temporary_directory)
|
||||
fstab_file = "{}/etc/fstab".format(temp_dir)
|
||||
# The volume ID starts at zero and increments by one for each volume in the fixed-order
|
||||
# volume list. This lets us work around the insanity of Libvirt IDs not matching guest IDs,
|
||||
# while still letting us have some semblance of control here without enforcing things
|
||||
@ -537,13 +539,13 @@ class VMBuilderScript(VMBuilder):
|
||||
volume_id += 1
|
||||
|
||||
# Write the hostname; you could also take an FQDN argument for this as an example
|
||||
hostname_file = "{}/etc/hostname".format(temporary_directory)
|
||||
hostname_file = "{}/etc/hostname".format(temp_dir)
|
||||
with open(hostname_file, "w") as fh:
|
||||
fh.write("{}".format(vm_name))
|
||||
|
||||
# Fix the cloud-init.target since it's broken by default in Debian 11
|
||||
cloudinit_target_file = "{}/etc/systemd/system/cloud-init.target".format(
|
||||
temporary_directory
|
||||
temp_dir
|
||||
)
|
||||
with open(cloudinit_target_file, "w") as fh:
|
||||
# We lose our indent on these raw blocks to preserve the apperance of the files
|
||||
@ -557,7 +559,7 @@ After=multi-user.target
|
||||
fh.write(data)
|
||||
|
||||
# Write the cloud-init configuration
|
||||
ci_cfg_file = "{}/etc/cloud/cloud.cfg".format(temporary_directory)
|
||||
ci_cfg_file = "{}/etc/cloud/cloud.cfg".format(temp_dir)
|
||||
with open(ci_cfg_file, "w") as fh:
|
||||
fh.write(
|
||||
"""
|
||||
@ -618,15 +620,15 @@ After=multi-user.target
|
||||
- arches: [default]
|
||||
failsafe:
|
||||
primary: {deb_mirror}
|
||||
"""
|
||||
).format(deb_mirror=deb_mirror)
|
||||
""".format(
|
||||
deb_mirror=deb_mirror
|
||||
)
|
||||
)
|
||||
|
||||
# Due to device ordering within the Libvirt XML configuration, the first Ethernet interface
|
||||
# will always be on PCI bus ID 2, hence the name "ens2".
|
||||
# Write a DHCP stanza for ens2
|
||||
ens2_network_file = "{}/etc/network/interfaces.d/ens2".format(
|
||||
temporary_directory
|
||||
)
|
||||
ens2_network_file = "{}/etc/network/interfaces.d/ens2".format(temp_dir)
|
||||
with open(ens2_network_file, "w") as fh:
|
||||
data = """auto ens2
|
||||
iface ens2 inet dhcp
|
||||
@ -634,7 +636,7 @@ iface ens2 inet dhcp
|
||||
fh.write(data)
|
||||
|
||||
# Write the DHCP config for ens2
|
||||
dhclient_file = "{}/etc/dhcp/dhclient.conf".format(temporary_directory)
|
||||
dhclient_file = "{}/etc/dhcp/dhclient.conf".format(temp_dir)
|
||||
with open(dhclient_file, "w") as fh:
|
||||
# We can use fstrings too, since PVC will always have Python 3.6+, though
|
||||
# using format() might be preferable for clarity in some situations
|
||||
@ -654,7 +656,7 @@ interface "ens2" {{
|
||||
fh.write(data)
|
||||
|
||||
# Write the GRUB configuration
|
||||
grubcfg_file = "{}/etc/default/grub".format(temporary_directory)
|
||||
grubcfg_file = "{}/etc/default/grub".format(temp_dir)
|
||||
with open(grubcfg_file, "w") as fh:
|
||||
data = """# Written by the PVC provisioner
|
||||
GRUB_DEFAULT=0
|
||||
@ -671,7 +673,7 @@ GRUB_DISABLE_LINUX_UUID=false
|
||||
fh.write(data)
|
||||
|
||||
# Do some tasks inside the chroot using the provided context manager
|
||||
with chroot(temporary_directory):
|
||||
with chroot(temp_dir):
|
||||
# Install and update GRUB
|
||||
os.system(
|
||||
"grub-install --force /dev/rbd/{}/{}_{}".format(
|
||||
@ -704,16 +706,17 @@ GRUB_DISABLE_LINUX_UUID=false
|
||||
"""
|
||||
|
||||
# Run any imports first
|
||||
import os
|
||||
from pvcapid.vmbuilder import open_zk
|
||||
from pvcapid.Daemon import config
|
||||
import daemon_lib.common as pvc_common
|
||||
import daemon_lib.ceph as pvc_ceph
|
||||
|
||||
# Set the tempdir we used in the prepare() and install() steps
|
||||
# Set the temp_dir we used in the prepare() and install() steps
|
||||
temp_dir = "/tmp/target"
|
||||
|
||||
# Unmount the bound devfs
|
||||
os.system("umount {}/dev".format(temporary_directory))
|
||||
os.system("umount {}/dev".format(temp_dir))
|
||||
|
||||
# Use this construct for reversing the list, as the normal reverse() messes with the list
|
||||
for volume in list(reversed(self.vm_data["volumes"])):
|
||||
@ -725,6 +728,7 @@ GRUB_DISABLE_LINUX_UUID=false
|
||||
if (
|
||||
volume.get("source_volume") is None
|
||||
and volume.get("filesystem") is not None
|
||||
and volume.get("filesystem") != "swap"
|
||||
):
|
||||
# Unmount filesystem
|
||||
retcode, stdout, stderr = pvc_common.run_os_command(
|
||||
|
@ -20,7 +20,7 @@
|
||||
###############################################################################
|
||||
|
||||
# This script provides an example of a PVC provisioner script. It will create a
|
||||
# standard VM config and install a RHEL-like OS using rinse.
|
||||
# standard VM config and install a RHEL 8+ or similar OS using rinse.
|
||||
|
||||
# This script can thus be used as an example or reference implementation of a
|
||||
# PVC provisioner script and expanded upon as required.
|
||||
@ -398,7 +398,7 @@ class VMBuilderScript(VMBuilder):
|
||||
if volume.get("source_volume") is not None:
|
||||
continue
|
||||
|
||||
if volume.get("filesystem") is None:
|
||||
if volume.get("filesystem") is None or volume.get("filesystem") == "swap":
|
||||
continue
|
||||
|
||||
mapped_dst_volume = f"/dev/rbd/{dst_volume}"
|
||||
@ -487,7 +487,7 @@ class VMBuilderScript(VMBuilder):
|
||||
post_packages = ["cloud-init"]
|
||||
|
||||
# We need to know our root disk for later GRUB-ing
|
||||
root_disk = None
|
||||
root_volume = None
|
||||
for volume in volumes:
|
||||
if volume["mountpoint"] == "/":
|
||||
root_volume = volume
|
||||
@ -571,21 +571,6 @@ class VMBuilderScript(VMBuilder):
|
||||
with open(hostname_file, "w") as fh:
|
||||
fh.write("{}".format(vm_name))
|
||||
|
||||
# Fix the cloud-init.target since it's broken by default
|
||||
cloudinit_target_file = "{}/etc/systemd/system/cloud-init.target".format(
|
||||
temporary_directory
|
||||
)
|
||||
with open(cloudinit_target_file, "w") as fh:
|
||||
# We lose our indent on these raw blocks to preserve the apperance of the files
|
||||
# inside the VM itself
|
||||
data = """[Install]
|
||||
WantedBy=multi-user.target
|
||||
[Unit]
|
||||
Description=Cloud-init target
|
||||
After=multi-user.target
|
||||
"""
|
||||
fh.write(data)
|
||||
|
||||
# Due to device ordering within the Libvirt XML configuration, the first Ethernet interface
|
||||
# will always be on PCI bus ID 2, hence the name "ens2".
|
||||
# Write a DHCP stanza for ens2
|
||||
@ -682,11 +667,6 @@ GRUB_SERIAL_COMMAND="serial --speed=115200 --unit=0 --word=8 --parity=no --stop=
|
||||
# Set the timezone to UTC
|
||||
os.system("ln -sf ../usr/share/zoneinfo/UTC /etc/localtime")
|
||||
|
||||
# Unmount the bound devfs and sysfs
|
||||
os.system("umount {}/dev".format(temporary_directory))
|
||||
os.system("umount {}/sys".format(temporary_directory))
|
||||
os.system("umount {}/proc".format(temporary_directory))
|
||||
|
||||
def cleanup(self):
|
||||
"""
|
||||
cleanup(): Perform any cleanup required due to prepare()/install()
|
||||
@ -700,6 +680,7 @@ GRUB_SERIAL_COMMAND="serial --speed=115200 --unit=0 --word=8 --parity=no --stop=
|
||||
"""
|
||||
|
||||
# Run any imports first
|
||||
import os
|
||||
from pvcapid.vmbuilder import open_zk
|
||||
from pvcapid.Daemon import config
|
||||
import daemon_lib.common as pvc_common
|
||||
@ -708,6 +689,11 @@ GRUB_SERIAL_COMMAND="serial --speed=115200 --unit=0 --word=8 --parity=no --stop=
|
||||
# Set the tempdir we used in the prepare() and install() steps
|
||||
temp_dir = "/tmp/target"
|
||||
|
||||
# Unmount the bound devfs and sysfs
|
||||
os.system(f"umount {temp_dir}/dev")
|
||||
os.system(f"umount {temp_dir}/sys")
|
||||
os.system(f"umount {temp_dir}/proc")
|
||||
|
||||
# Use this construct for reversing the list, as the normal reverse() messes with the list
|
||||
for volume in list(reversed(self.vm_data["volumes"])):
|
||||
dst_volume_name = f"{self.vm_name}_{volume['disk_id']}"
|
||||
@ -718,6 +704,7 @@ GRUB_SERIAL_COMMAND="serial --speed=115200 --unit=0 --word=8 --parity=no --stop=
|
||||
if (
|
||||
volume.get("source_volume") is None
|
||||
and volume.get("filesystem") is not None
|
||||
and volume.get("filesystem") != "swap"
|
||||
):
|
||||
# Unmount filesystem
|
||||
retcode, stdout, stderr = pvc_common.run_os_command(
|
||||
@ -728,14 +715,14 @@ GRUB_SERIAL_COMMAND="serial --speed=115200 --unit=0 --word=8 --parity=no --stop=
|
||||
f"Failed to unmount '{mapped_dst_volume}' on '{mount_path}': {stderr}"
|
||||
)
|
||||
|
||||
# Unmap volume
|
||||
with open_zk(config) as zkhandler:
|
||||
success, message = pvc_ceph.unmap_volume(
|
||||
zkhandler,
|
||||
volume["pool"],
|
||||
dst_volume_name,
|
||||
)
|
||||
if not success:
|
||||
raise ProvisioningError(
|
||||
f"Failed to unmap '{mapped_dst_volume}': {stderr}"
|
||||
)
|
||||
# Unmap volume
|
||||
with open_zk(config) as zkhandler:
|
||||
success, message = pvc_ceph.unmap_volume(
|
||||
zkhandler,
|
||||
volume["pool"],
|
||||
dst_volume_name,
|
||||
)
|
||||
if not success:
|
||||
raise ProvisioningError(
|
||||
f"Failed to unmap '{mapped_dst_volume}': {stderr}"
|
||||
)
|
||||
|
@@ -11,5 +11,16 @@ if [[ ! -f ${PVC_CONFIG_FILE} ]]; then
 fi

 pushd /usr/share/pvc
-./pvcapid-manage.py db upgrade
+
+case "$( cat /etc/debian_version )" in
+    10.*|11.*)
+        # Debian 10 & 11
+        ./pvcapid-manage_legacy.py db upgrade
+        ;;
+    *)
+        # Debian 12+
+        flask --app ./pvcapid-manage_flask.py db upgrade
+        ;;
+esac
+
 popd
api-daemon/pvcapid-manage_flask.py (new executable file, 29 lines)

@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+# pvcapid-manage_flask.py - PVC Database management tasks (via Flask CLI)
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+from pvcapid.flaskapi import app, db
+from pvcapid.models import *  # noqa F401,F403
+
+from flask_migrate import Migrate
+
+migrate = Migrate(app, db)
+
+# Call flask --app /usr/share/pvc/pvcapid-manage_flask.py db upgrade
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3

-# manage.py - PVC Database management tasks
+# pvcapid-manage_legacy.py - PVC Database management tasks (Legacy)
 # Part of the Parallel Virtual Cluster (PVC) system
 #
 # Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
@@ -19,8 +19,7 @@
 #
 ###############################################################################

-from flask_migrate import Migrate, MigrateCommand
-from flask_script import Manager
+from flask_migrate import Migrate, MigrateCommand, Manager

 from pvcapid.flaskapi import app, db
 from pvcapid.models import *  # noqa F401,F403
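For context, the legacy manage script follows the classic Flask-Script pattern, which is why it is invoked directly (`./pvcapid-manage_legacy.py db upgrade`) while the Flask-Migrate-only variant above is driven through `flask --app ... db upgrade`. A minimal sketch of that pattern, assuming `app` and `db` come from `pvcapid.flaskapi` as in the imports shown; this is the generic wiring, not necessarily the exact body of the PVC script:

```python
from flask_script import Manager
from flask_migrate import Migrate, MigrateCommand

from pvcapid.flaskapi import app, db

migrate = Migrate(app, db)                 # register Alembic migrations against the app/db
manager = Manager(app)                     # Flask-Script command dispatcher
manager.add_command("db", MigrateCommand)  # exposes "db upgrade", "db migrate", etc.

if __name__ == "__main__":
    manager.run()                          # parses argv, so "./script.py db upgrade" works
```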
@@ -27,12 +27,8 @@ case "$( cat /etc/debian_version )" in
     10.*)
         CELERY_ARGS="worker --app pvcapid.flaskapi.celery --concurrency 1 --loglevel INFO"
         ;;
-    11.*)
-        CELERY_ARGS="--app pvcapid.flaskapi.celery worker --concurrency 1 --loglevel INFO"
-        ;;
     *)
-        echo "Invalid Debian version found!"
-        exit 1
+        CELERY_ARGS="--app pvcapid.flaskapi.celery worker --concurrency 1 --loglevel INFO"
         ;;
 esac
@@ -27,7 +27,7 @@ from ssl import SSLContext, TLSVersion
 from distutils.util import strtobool as dustrtobool

 # Daemon version
-version = "0.9.58"
+version = "0.9.76"

 # API version
 API_VERSION = 1.0
@ -80,6 +80,7 @@ celery.conf.update(app.config)
|
||||
# Custom decorators
|
||||
#
|
||||
|
||||
|
||||
# Request parser decorator
|
||||
class RequestParser(object):
|
||||
def __init__(self, reqargs):
|
||||
@ -164,6 +165,7 @@ def run_benchmark(self, pool):
|
||||
# API Root/Authentication
|
||||
##########################################################
|
||||
|
||||
|
||||
# /
|
||||
class API_Root(Resource):
|
||||
def get(self):
|
||||
@ -448,18 +450,48 @@ class API_Status(Resource):
|
||||
type: object
|
||||
id: ClusterStatus
|
||||
properties:
|
||||
health:
|
||||
cluster_health:
|
||||
type: object
|
||||
properties:
|
||||
health:
|
||||
type: integer
|
||||
description: The overall health (%) of the cluster
|
||||
example: 100
|
||||
messages:
|
||||
type: array
|
||||
description: A list of health event strings
|
||||
items:
|
||||
type: string
|
||||
example: "hv1: plugin 'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps"
|
||||
node_health:
|
||||
type: object
|
||||
properties:
|
||||
hvX:
|
||||
type: object
|
||||
description: A node entry for per-node health details, one per node in the cluster
|
||||
properties:
|
||||
health:
|
||||
type: integer
|
||||
description: The health (%) of the node
|
||||
example: 100
|
||||
messages:
|
||||
type: array
|
||||
description: A list of health event strings
|
||||
items:
|
||||
type: string
|
||||
example: "'nics': bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps"
|
||||
maintenance:
|
||||
type: string
|
||||
description: The overall cluster health
|
||||
example: Optimal
|
||||
storage_health:
|
||||
type: string
|
||||
description: The overall storage cluster health
|
||||
example: Optimal
|
||||
description: Whether the cluster is in maintenance mode or not (string boolean)
|
||||
example: true
|
||||
primary_node:
|
||||
type: string
|
||||
description: The current primary coordinator node
|
||||
example: pvchv1
|
||||
pvc_version:
|
||||
type: string
|
||||
description: The PVC version of the current primary coordinator node
|
||||
example: 0.9.61
|
||||
upstream_ip:
|
||||
type: string
|
||||
description: The cluster upstream IP address in CIDR format
|
||||
@ -556,6 +588,7 @@ api.add_resource(API_Status, "/status")
|
||||
# Client API - Node
|
||||
##########################################################
|
||||
|
||||
|
||||
# /node
|
||||
class API_Node_Root(Resource):
|
||||
@RequestParser(
|
||||
@ -605,6 +638,38 @@ class API_Node_Root(Resource):
|
||||
arch:
|
||||
type: string
|
||||
description: The architecture of the CPU
|
||||
health:
|
||||
type: integer
|
||||
description: The overall health (%) of the node
|
||||
example: 100
|
||||
health_plugins:
|
||||
type: array
|
||||
description: A list of health plugin names currently loaded on the node
|
||||
items:
|
||||
type: string
|
||||
example: "nics"
|
||||
health_details:
|
||||
type: array
|
||||
description: A list of health plugin results
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
description: The name of the health plugin
|
||||
example: nics
|
||||
last_run:
|
||||
type: integer
|
||||
description: The UNIX timestamp (s) of the last plugin run
|
||||
example: 1676786078
|
||||
health_delta:
|
||||
type: integer
|
||||
description: The health delta (negatively applied to the health percentage) of the plugin's current state
|
||||
example: 10
|
||||
message:
|
||||
type: string
|
||||
description: The output message of the plugin
|
||||
example: "bond0 DEGRADED with 1 active slaves, bond0 OK at 10000 Mbps"
|
||||
load:
|
||||
type: number
|
||||
format: float
|
||||
@ -952,6 +1017,7 @@ api.add_resource(API_Node_Log, "/node/<node>/log")
|
||||
# Client API - VM
|
||||
##########################################################
|
||||
|
||||
|
||||
# /vm
|
||||
class API_VM_Root(Resource):
|
||||
@RequestParser(
|
||||
@ -1253,7 +1319,7 @@ class API_VM_Root(Resource):
|
||||
{"name": "node"},
|
||||
{
|
||||
"name": "selector",
|
||||
"choices": ("mem", "memfree", "vcpus", "load", "vms", "none"),
|
||||
"choices": ("mem", "memprov", "vcpus", "load", "vms", "none"),
|
||||
"helptext": "A valid selector must be specified",
|
||||
},
|
||||
{"name": "autostart"},
|
||||
@ -1302,7 +1368,7 @@ class API_VM_Root(Resource):
|
||||
default: none
|
||||
enum:
|
||||
- mem
|
||||
- memfree
|
||||
- memprov
|
||||
- vcpus
|
||||
- load
|
||||
- vms
|
||||
@ -1400,7 +1466,7 @@ class API_VM_Element(Resource):
|
||||
{"name": "node"},
|
||||
{
|
||||
"name": "selector",
|
||||
"choices": ("mem", "memfree", "vcpus", "load", "vms", "none"),
|
||||
"choices": ("mem", "memprov", "vcpus", "load", "vms", "none"),
|
||||
"helptext": "A valid selector must be specified",
|
||||
},
|
||||
{"name": "autostart"},
|
||||
@ -1451,7 +1517,7 @@ class API_VM_Element(Resource):
|
||||
default: none
|
||||
enum:
|
||||
- mem
|
||||
- memfree
|
||||
- memprov
|
||||
- vcpus
|
||||
- load
|
||||
- vms
|
||||
@ -1650,7 +1716,7 @@ class API_VM_Metadata(Resource):
|
||||
{"name": "limit"},
|
||||
{
|
||||
"name": "selector",
|
||||
"choices": ("mem", "memfree", "vcpus", "load", "vms", "none"),
|
||||
"choices": ("mem", "memprov", "vcpus", "load", "vms", "none"),
|
||||
"helptext": "A valid selector must be specified",
|
||||
},
|
||||
{"name": "autostart"},
|
||||
@ -1682,7 +1748,7 @@ class API_VM_Metadata(Resource):
|
||||
description: The selector used to determine candidate nodes during migration; see 'target_selector' in the node daemon configuration reference
|
||||
enum:
|
||||
- mem
|
||||
- memfree
|
||||
- memprov
|
||||
- vcpus
|
||||
- load
|
||||
- vms
|
||||
@ -2231,6 +2297,7 @@ api.add_resource(API_VM_Device, "/vm/<vm>/device")
|
||||
# Client API - Network
|
||||
##########################################################
|
||||
|
||||
|
||||
# /network
|
||||
class API_Network_Root(Resource):
|
||||
@RequestParser([{"name": "limit"}])
|
||||
@ -3199,6 +3266,7 @@ api.add_resource(API_Network_ACL_Element, "/network/<vni>/acl/<description>")
|
||||
# Client API - SR-IOV
|
||||
##########################################################
|
||||
|
||||
|
||||
# /sriov
|
||||
class API_SRIOV_Root(Resource):
|
||||
@Authenticator
|
||||
@ -3526,6 +3594,7 @@ api.add_resource(API_SRIOV_VF_Element, "/sriov/vf/<node>/<vf>")
|
||||
# (i.e. it references Ceph-specific concepts), this makes more
|
||||
# sense in the long-term.#
|
||||
|
||||
|
||||
# /storage
|
||||
class API_Storage_Root(Resource):
|
||||
@Authenticator
|
||||
@ -5338,6 +5407,7 @@ api.add_resource(
|
||||
# Provisioner API
|
||||
##########################################################
|
||||
|
||||
|
||||
# /provisioner
|
||||
class API_Provisioner_Root(Resource):
|
||||
@Authenticator
|
||||
|
@@ -100,7 +100,7 @@ devices_scsi_controller = """    <controller type='scsi' index='0' model='virtio
 # * vm_name
 # * disk_id
 devices_disk_header = """    <disk type='network' device='disk'>
-      <driver name='qemu' discard='unmap'/>
+      <driver name='qemu' discard='unmap' cache='none'/>
       <target dev='{disk_id}' bus='scsi'/>
       <auth username='libvirt'>
         <secret type='ceph' uuid='{ceph_storage_secret}'/>

@@ -44,6 +44,7 @@ import pvcapid.provisioner as provisioner
 # Common functions
 #

+
 # Database connections
 def open_database(config):
     conn = psycopg2.connect(

@@ -59,6 +59,7 @@ class ProvisioningError(Exception):
 # Common functions
 #

+
 # Database connections
 def open_database(config):
     conn = psycopg2.connect(

@@ -580,7 +581,7 @@ def delete_template_network_element(name, vni):
     networks, code = list_template_network_vnis(name)
     found_vni = False
     for network in networks:
-        if network["vni"] == int(vni):
+        if network["vni"] == vni:
             found_vni = True
     if not found_vni:
         retmsg = {
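The VNI membership check in the loop above could be phrased more compactly with `any()`; a small equivalent sketch, assuming `networks` is the list returned by `list_template_network_vnis()`:

```python
# Equivalent to the found_vni loop shown above: True if any template network
# carries the requested VNI, comparing with whatever type the stored "vni" field has.
found_vni = any(network["vni"] == vni for network in networks)
```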
client-cli-old/pvc.py (new executable file, 33 lines)

@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+
+# pvc.py - PVC client command-line interface (stub testing interface)
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+import pvc.pvc
+
+
+#
+# Main entry point
+#
+def main():
+    return pvc.pvc.cli(obj={})
+
+
+if __name__ == "__main__":
+    main()
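The `obj={}` argument in the stub above is click's way of seeding the shared context object that subcommands later receive. A small self-contained sketch of that mechanism; the group and command here are illustrative, not the real PVC commands:

```python
import click


@click.group()
@click.pass_context
def cli(ctx):
    # ctx.obj is the dict passed in via cli(obj={}); stash shared state in it
    ctx.obj["connection"] = "local"


@cli.command()
@click.pass_obj
def status(obj):
    # Subcommands receive the same dict via @click.pass_obj
    click.echo(f"Using connection: {obj['connection']}")


if __name__ == "__main__":
    cli(obj={})  # same invocation style as the stub's pvc.pvc.cli(obj={})
```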
client-cli-old/pvc/lib/__init__.py (new empty file)
@@ -27,8 +27,8 @@ from requests_toolbelt.multipart.encoder import (
     MultipartEncoderMonitor,
 )

-import pvc.cli_lib.ansiprint as ansiprint
-from pvc.cli_lib.common import UploadProgressBar, call_api
+import pvc.lib.ansiprint as ansiprint
+from pvc.lib.common import UploadProgressBar, call_api

 #
 # Supplemental functions
@ -21,8 +21,8 @@
|
||||
|
||||
import json
|
||||
|
||||
import pvc.cli_lib.ansiprint as ansiprint
|
||||
from pvc.cli_lib.common import call_api
|
||||
import pvc.lib.ansiprint as ansiprint
|
||||
from pvc.lib.common import call_api
|
||||
|
||||
|
||||
def initialize(config, overwrite=False):
|
||||
@ -125,81 +125,63 @@ def format_info(cluster_information, oformat):
|
||||
return json.dumps(cluster_information, indent=4)
|
||||
|
||||
# Plain formatting, i.e. human-readable
|
||||
if cluster_information["health"] == "Optimal":
|
||||
health_colour = ansiprint.green()
|
||||
elif cluster_information["health"] == "Maintenance":
|
||||
if (
|
||||
cluster_information.get("maintenance") == "true"
|
||||
or cluster_information.get("cluster_health", {}).get("health", "N/A") == "N/A"
|
||||
):
|
||||
health_colour = ansiprint.blue()
|
||||
else:
|
||||
elif cluster_information.get("cluster_health", {}).get("health", 100) > 90:
|
||||
health_colour = ansiprint.green()
|
||||
elif cluster_information.get("cluster_health", {}).get("health", 100) > 50:
|
||||
health_colour = ansiprint.yellow()
|
||||
|
||||
if cluster_information["storage_health"] == "Optimal":
|
||||
storage_health_colour = ansiprint.green()
|
||||
elif cluster_information["storage_health"] == "Maintenance":
|
||||
storage_health_colour = ansiprint.blue()
|
||||
else:
|
||||
storage_health_colour = ansiprint.yellow()
|
||||
health_colour = ansiprint.red()
|
||||
|
||||
ainformation = []
|
||||
|
||||
if oformat == "short":
|
||||
ainformation.append(
|
||||
"{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end())
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Cluster health:{} {}{}{}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
health_colour,
|
||||
cluster_information["health"],
|
||||
ansiprint.end(),
|
||||
)
|
||||
)
|
||||
if cluster_information["health_msg"]:
|
||||
for line in cluster_information["health_msg"]:
|
||||
ainformation.append(" > {}".format(line))
|
||||
ainformation.append(
|
||||
"{}Storage health:{} {}{}{}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
storage_health_colour,
|
||||
cluster_information["storage_health"],
|
||||
ansiprint.end(),
|
||||
)
|
||||
)
|
||||
if cluster_information["storage_health_msg"]:
|
||||
for line in cluster_information["storage_health_msg"]:
|
||||
ainformation.append(" > {}".format(line))
|
||||
|
||||
return "\n".join(ainformation)
|
||||
|
||||
ainformation.append(
|
||||
"{}PVC cluster status:{}".format(ansiprint.bold(), ansiprint.end())
|
||||
)
|
||||
ainformation.append("")
|
||||
|
||||
health_text = (
|
||||
f"{cluster_information.get('cluster_health', {}).get('health', 'N/A')}"
|
||||
)
|
||||
if health_text != "N/A":
|
||||
health_text += "%"
|
||||
if cluster_information.get("maintenance") == "true":
|
||||
health_text += " (maintenance on)"
|
||||
|
||||
ainformation.append(
|
||||
"{}Cluster health:{} {}{}{}".format(
|
||||
"{}Cluster health:{} {}{}{}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
health_colour,
|
||||
cluster_information["health"],
|
||||
health_text,
|
||||
ansiprint.end(),
|
||||
)
|
||||
)
|
||||
if cluster_information["health_msg"]:
|
||||
for line in cluster_information["health_msg"]:
|
||||
ainformation.append(" > {}".format(line))
|
||||
ainformation.append(
|
||||
"{}Storage health:{} {}{}{}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
storage_health_colour,
|
||||
cluster_information["storage_health"],
|
||||
ansiprint.end(),
|
||||
if cluster_information.get("cluster_health", {}).get("messages"):
|
||||
health_messages = "\n > ".join(
|
||||
sorted(cluster_information["cluster_health"]["messages"])
|
||||
)
|
||||
)
|
||||
if cluster_information["storage_health_msg"]:
|
||||
for line in cluster_information["storage_health_msg"]:
|
||||
ainformation.append(" > {}".format(line))
|
||||
ainformation.append(
|
||||
"{}Health messages:{} > {}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
health_messages,
|
||||
)
|
||||
)
|
||||
else:
|
||||
ainformation.append(
|
||||
"{}Health messages:{} N/A".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
)
|
||||
)
|
||||
|
||||
if oformat == "short":
|
||||
return "\n".join(ainformation)
|
||||
|
||||
ainformation.append("")
|
||||
ainformation.append(
|
||||
@ -207,6 +189,13 @@ def format_info(cluster_information, oformat):
|
||||
ansiprint.purple(), ansiprint.end(), cluster_information["primary_node"]
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}PVC version:{} {}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
cluster_information.get("pvc_version", "N/A"),
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Cluster upstream IP:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), cluster_information["upstream_ip"]
|
@@ -124,8 +124,8 @@ def call_api(
     data=None,
     files=None,
 ):
-    # Set the connect timeout to 3 seconds but extremely long (48 hour) data timeout
-    timeout = (3.05, 172800)
+    # Set the connect timeout to 2 seconds but extremely long (48 hour) data timeout
+    timeout = (2.05, 172800)

     # Craft the URI
     uri = "{}://{}{}{}".format(
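The two-element timeout above is the standard `requests` idiom: the first value bounds the TCP connect, the second bounds the wait for response data. A minimal illustration of the same pattern; the URL is a placeholder, not a real endpoint:

```python
import requests

# (connect timeout, read timeout): fail fast if the API endpoint is unreachable,
# but allow a very long-running response once connected.
timeout = (2.05, 172800)

response = requests.get("http://pvc-api.example.com:7370/api/v1/status", timeout=timeout)
print(response.status_code)
```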
@@ -20,8 +20,8 @@
 ###############################################################################

 import re
-import pvc.cli_lib.ansiprint as ansiprint
-from pvc.cli_lib.common import call_api
+import pvc.lib.ansiprint as ansiprint
+from pvc.lib.common import call_api


 def isValidMAC(macaddr):
@@ -961,7 +961,9 @@ def format_list_dhcp(dhcp_lease_list):
         )
     )

-    for dhcp_lease_information in sorted(dhcp_lease_list, key=lambda l: l["hostname"]):
+    for dhcp_lease_information in sorted(
+        dhcp_lease_list, key=lambda lease: lease["hostname"]
+    ):
         dhcp_lease_list_output.append(
             "{bold}\
 {lease_hostname: <{lease_hostname_length}} \
@@ -1059,7 +1061,7 @@ def format_list_acl(acl_list):
     )

     for acl_information in sorted(
-        acl_list, key=lambda l: l["direction"] + str(l["order"])
+        acl_list, key=lambda acl: acl["direction"] + str(acl["order"])
     ):
         acl_list_output.append(
             "{bold}\
@ -21,8 +21,8 @@
|
||||
|
||||
import time
|
||||
|
||||
import pvc.cli_lib.ansiprint as ansiprint
|
||||
from pvc.cli_lib.common import call_api
|
||||
import pvc.lib.ansiprint as ansiprint
|
||||
from pvc.lib.common import call_api
|
||||
|
||||
|
||||
#
|
||||
@ -215,6 +215,19 @@ def node_list(
|
||||
# Output display functions
|
||||
#
|
||||
def getOutputColours(node_information):
|
||||
node_health = node_information.get("health", "N/A")
|
||||
if isinstance(node_health, int):
|
||||
if node_health <= 50:
|
||||
health_colour = ansiprint.red()
|
||||
elif node_health <= 90:
|
||||
health_colour = ansiprint.yellow()
|
||||
elif node_health <= 100:
|
||||
health_colour = ansiprint.green()
|
||||
else:
|
||||
health_colour = ansiprint.blue()
|
||||
else:
|
||||
health_colour = ansiprint.blue()
|
||||
|
||||
if node_information["daemon_state"] == "run":
|
||||
daemon_state_colour = ansiprint.green()
|
||||
elif node_information["daemon_state"] == "stop":
|
||||
@ -251,6 +264,7 @@ def getOutputColours(node_information):
|
||||
mem_provisioned_colour = ""
|
||||
|
||||
return (
|
||||
health_colour,
|
||||
daemon_state_colour,
|
||||
coordinator_state_colour,
|
||||
domain_state_colour,
|
||||
@ -261,6 +275,7 @@ def getOutputColours(node_information):
|
||||
|
||||
def format_info(node_information, long_output):
|
||||
(
|
||||
health_colour,
|
||||
daemon_state_colour,
|
||||
coordinator_state_colour,
|
||||
domain_state_colour,
|
||||
@ -273,14 +288,56 @@ def format_info(node_information, long_output):
|
||||
# Basic information
|
||||
ainformation.append(
|
||||
"{}Name:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["name"]
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
node_information["name"],
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}PVC Version:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["pvc_version"]
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
node_information["pvc_version"],
|
||||
)
|
||||
)
|
||||
|
||||
node_health = node_information.get("health", "N/A")
|
||||
if isinstance(node_health, int):
|
||||
node_health_text = f"{node_health}%"
|
||||
else:
|
||||
node_health_text = node_health
|
||||
ainformation.append(
|
||||
"{}Health:{} {}{}{}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
health_colour,
|
||||
node_health_text,
|
||||
ansiprint.end(),
|
||||
)
|
||||
)
|
||||
|
||||
node_health_details = node_information.get("health_details", [])
|
||||
if long_output:
|
||||
node_health_messages = "\n ".join(
|
||||
[f"{plugin['name']}: {plugin['message']}" for plugin in node_health_details]
|
||||
)
|
||||
else:
|
||||
node_health_messages = "\n ".join(
|
||||
[
|
||||
f"{plugin['name']}: {plugin['message']}"
|
||||
for plugin in node_health_details
|
||||
if int(plugin.get("health_delta", 0)) > 0
|
||||
]
|
||||
)
|
||||
|
||||
if len(node_health_messages) > 0:
|
||||
ainformation.append(
|
||||
"{}Health Plugin Details:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_health_messages
|
||||
)
|
||||
)
|
||||
ainformation.append("")
|
||||
|
||||
ainformation.append(
|
||||
"{}Daemon State:{} {}{}{}".format(
|
||||
ansiprint.purple(),
|
||||
@ -308,11 +365,6 @@ def format_info(node_information, long_output):
|
||||
ansiprint.end(),
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Active VM Count:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["domains_count"]
|
||||
)
|
||||
)
|
||||
if long_output:
|
||||
ainformation.append("")
|
||||
ainformation.append(
|
||||
@ -331,6 +383,11 @@ def format_info(node_information, long_output):
|
||||
)
|
||||
)
|
||||
ainformation.append("")
|
||||
ainformation.append(
|
||||
"{}Active VM Count:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["domains_count"]
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Host CPUs:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["vcpu"]["total"]
|
||||
@ -397,6 +454,7 @@ def format_list(node_list, raw):
|
||||
# Determine optimal column widths
|
||||
node_name_length = 5
|
||||
pvc_version_length = 8
|
||||
health_length = 7
|
||||
daemon_state_length = 7
|
||||
coordinator_state_length = 12
|
||||
domain_state_length = 7
|
||||
@ -417,6 +475,15 @@ def format_list(node_list, raw):
|
||||
_pvc_version_length = len(node_information.get("pvc_version", "N/A")) + 1
|
||||
if _pvc_version_length > pvc_version_length:
|
||||
pvc_version_length = _pvc_version_length
|
||||
# node_health column
|
||||
node_health = node_information.get("health", "N/A")
|
||||
if isinstance(node_health, int):
|
||||
node_health_text = f"{node_health}%"
|
||||
else:
|
||||
node_health_text = node_health
|
||||
_health_length = len(node_health_text) + 1
|
||||
if _health_length > health_length:
|
||||
health_length = _health_length
|
||||
# daemon_state column
|
||||
_daemon_state_length = len(node_information["daemon_state"]) + 1
|
||||
if _daemon_state_length > daemon_state_length:
|
||||
@ -466,7 +533,10 @@ def format_list(node_list, raw):
|
||||
# Format the string (header)
|
||||
node_list_output.append(
|
||||
"{bold}{node_header: <{node_header_length}} {state_header: <{state_header_length}} {resource_header: <{resource_header_length}} {memory_header: <{memory_header_length}}{end_bold}".format(
|
||||
node_header_length=node_name_length + pvc_version_length + 1,
|
||||
node_header_length=node_name_length
|
||||
+ pvc_version_length
|
||||
+ health_length
|
||||
+ 2,
|
||||
state_header_length=daemon_state_length
|
||||
+ coordinator_state_length
|
||||
+ domain_state_length
|
||||
@ -484,7 +554,14 @@ def format_list(node_list, raw):
|
||||
bold=ansiprint.bold(),
|
||||
end_bold=ansiprint.end(),
|
||||
node_header="Nodes "
|
||||
+ "".join(["-" for _ in range(6, node_name_length + pvc_version_length)]),
|
||||
+ "".join(
|
||||
[
|
||||
"-"
|
||||
for _ in range(
|
||||
6, node_name_length + pvc_version_length + health_length + 1
|
||||
)
|
||||
]
|
||||
),
|
||||
state_header="States "
|
||||
+ "".join(
|
||||
[
|
||||
@ -526,12 +603,13 @@ def format_list(node_list, raw):
|
||||
)
|
||||
|
||||
node_list_output.append(
|
||||
"{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} \
|
||||
"{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} {node_health: <{health_length}} \
|
||||
{daemon_state_colour}{node_daemon_state: <{daemon_state_length}}{end_colour} {coordinator_state_colour}{node_coordinator_state: <{coordinator_state_length}}{end_colour} {domain_state_colour}{node_domain_state: <{domain_state_length}}{end_colour} \
|
||||
{node_domains_count: <{domains_count_length}} {node_cpu_count: <{cpu_count_length}} {node_load: <{load_length}} \
|
||||
{node_mem_total: <{mem_total_length}} {node_mem_used: <{mem_used_length}} {node_mem_free: <{mem_free_length}} {node_mem_allocated: <{mem_alloc_length}} {node_mem_provisioned: <{mem_prov_length}}{end_bold}".format(
|
||||
node_name_length=node_name_length,
|
||||
pvc_version_length=pvc_version_length,
|
||||
health_length=health_length,
|
||||
daemon_state_length=daemon_state_length,
|
||||
coordinator_state_length=coordinator_state_length,
|
||||
domain_state_length=domain_state_length,
|
||||
@ -551,6 +629,7 @@ def format_list(node_list, raw):
|
||||
end_colour="",
|
||||
node_name="Name",
|
||||
node_pvc_version="Version",
|
||||
node_health="Health",
|
||||
node_daemon_state="Daemon",
|
||||
node_coordinator_state="Coordinator",
|
||||
node_domain_state="Domain",
|
||||
@ -568,19 +647,28 @@ def format_list(node_list, raw):
|
||||
# Format the string (elements)
|
||||
for node_information in sorted(node_list, key=lambda n: n["name"]):
|
||||
(
|
||||
health_colour,
|
||||
daemon_state_colour,
|
||||
coordinator_state_colour,
|
||||
domain_state_colour,
|
||||
mem_allocated_colour,
|
||||
mem_provisioned_colour,
|
||||
) = getOutputColours(node_information)
|
||||
|
||||
node_health = node_information.get("health", "N/A")
|
||||
if isinstance(node_health, int):
|
||||
node_health_text = f"{node_health}%"
|
||||
else:
|
||||
node_health_text = node_health
|
||||
|
||||
node_list_output.append(
|
||||
"{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} \
|
||||
"{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} {health_colour}{node_health: <{health_length}}{end_colour} \
|
||||
{daemon_state_colour}{node_daemon_state: <{daemon_state_length}}{end_colour} {coordinator_state_colour}{node_coordinator_state: <{coordinator_state_length}}{end_colour} {domain_state_colour}{node_domain_state: <{domain_state_length}}{end_colour} \
|
||||
{node_domains_count: <{domains_count_length}} {node_cpu_count: <{cpu_count_length}} {node_load: <{load_length}} \
|
||||
{node_mem_total: <{mem_total_length}} {node_mem_used: <{mem_used_length}} {node_mem_free: <{mem_free_length}} {mem_allocated_colour}{node_mem_allocated: <{mem_alloc_length}}{end_colour} {mem_provisioned_colour}{node_mem_provisioned: <{mem_prov_length}}{end_colour}{end_bold}".format(
|
||||
node_name_length=node_name_length,
|
||||
pvc_version_length=pvc_version_length,
|
||||
health_length=health_length,
|
||||
daemon_state_length=daemon_state_length,
|
||||
coordinator_state_length=coordinator_state_length,
|
||||
domain_state_length=domain_state_length,
|
||||
@ -594,6 +682,7 @@ def format_list(node_list, raw):
|
||||
mem_prov_length=mem_prov_length,
|
||||
bold="",
|
||||
end_bold="",
|
||||
health_colour=health_colour,
|
||||
daemon_state_colour=daemon_state_colour,
|
||||
coordinator_state_colour=coordinator_state_colour,
|
||||
domain_state_colour=domain_state_colour,
|
||||
@ -602,6 +691,7 @@ def format_list(node_list, raw):
|
||||
end_colour=ansiprint.end(),
|
||||
node_name=node_information["name"],
|
||||
node_pvc_version=node_information.get("pvc_version", "N/A"),
|
||||
node_health=node_health_text,
|
||||
node_daemon_state=node_information["daemon_state"],
|
||||
node_coordinator_state=node_information["coordinator_state"],
|
||||
node_domain_state=node_information["domain_state"],
|
@@ -24,8 +24,8 @@ from requests_toolbelt.multipart.encoder import (
     MultipartEncoderMonitor,
 )

-import pvc.cli_lib.ansiprint as ansiprint
-from pvc.cli_lib.common import UploadProgressBar, call_api
+import pvc.lib.ansiprint as ansiprint
+from pvc.lib.common import UploadProgressBar, call_api
 from ast import literal_eval
@@ -22,8 +22,8 @@
 import time
 import re

-import pvc.cli_lib.ansiprint as ansiprint
-from pvc.cli_lib.common import call_api, format_bytes, format_metric
+import pvc.lib.ansiprint as ansiprint
+from pvc.lib.common import call_api, format_bytes, format_metric


 #
@@ -677,7 +677,7 @@ def vm_networks_add(
     from lxml.objectify import fromstring
     from lxml.etree import tostring
     from random import randint
-    import pvc.cli_lib.network as pvc_network
+    import pvc.lib.network as pvc_network

     network_exists, _ = pvc_network.net_info(config, network)
     if not network_exists:
@@ -1046,7 +1046,7 @@ def vm_volumes_add(config, vm, volume, disk_id, bus, disk_type, live, restart):
     from lxml.objectify import fromstring
     from lxml.etree import tostring
     from copy import deepcopy
-    import pvc.cli_lib.ceph as pvc_ceph
+    import pvc.lib.ceph as pvc_ceph

     if disk_type == "rbd":
         # Verify that the provided volume is valid
@ -37,13 +37,13 @@ from distutils.util import strtobool
|
||||
|
||||
from functools import wraps
|
||||
|
||||
import pvc.cli_lib.ansiprint as ansiprint
|
||||
import pvc.cli_lib.cluster as pvc_cluster
|
||||
import pvc.cli_lib.node as pvc_node
|
||||
import pvc.cli_lib.vm as pvc_vm
|
||||
import pvc.cli_lib.network as pvc_network
|
||||
import pvc.cli_lib.ceph as pvc_ceph
|
||||
import pvc.cli_lib.provisioner as pvc_provisioner
|
||||
import pvc.lib.ansiprint as ansiprint
|
||||
import pvc.lib.cluster as pvc_cluster
|
||||
import pvc.lib.node as pvc_node
|
||||
import pvc.lib.vm as pvc_vm
|
||||
import pvc.lib.network as pvc_network
|
||||
import pvc.lib.ceph as pvc_ceph
|
||||
import pvc.lib.provisioner as pvc_provisioner
|
||||
|
||||
|
||||
myhostname = socket.gethostname().split(".")[0]
|
||||
@ -134,7 +134,7 @@ def get_config(store_data, cluster=None):
|
||||
config = dict()
|
||||
config["debug"] = False
|
||||
config["cluster"] = cluster
|
||||
config["desctription"] = description
|
||||
config["description"] = description
|
||||
config["api_host"] = "{}:{}".format(host, port)
|
||||
config["api_scheme"] = scheme
|
||||
config["api_key"] = api_key
|
||||
@ -382,8 +382,6 @@ def cluster_list(raw):
|
||||
|
||||
if not raw:
|
||||
# Display the data nicely
|
||||
echo("Available clusters:")
|
||||
echo("")
|
||||
echo(
|
||||
"{bold}{name: <{name_length}} {description: <{description_length}} {address: <{address_length}} {port: <{port_length}} {scheme: <{scheme_length}} {api_key: <{api_key_length}}{end_bold}".format(
|
||||
bold=ansiprint.bold(),
|
||||
@ -443,6 +441,230 @@ def cluster_list(raw):
|
||||
echo(cluster)
|
||||
|
||||
|
||||
###############################################################################
|
||||
# pvc cluster detail
|
||||
###############################################################################
|
||||
@click.command(name="detail", short_help="Show details of all available clusters.")
|
||||
def cluster_detail():
|
||||
"""
|
||||
Show quick details of all PVC clusters configured in this CLI instance.
|
||||
"""
|
||||
|
||||
# Get the existing data
|
||||
clusters = get_store(store_path)
|
||||
|
||||
cluster_details_list = list()
|
||||
|
||||
echo("Gathering information from clusters... ", nl=False)
|
||||
|
||||
for cluster in clusters:
|
||||
_store_data = get_store(store_path)
|
||||
cluster_config = get_config(_store_data, cluster=cluster)
|
||||
retcode, retdata = pvc_cluster.get_info(cluster_config)
|
||||
if retcode == 0:
|
||||
retdata = None
|
||||
cluster_details = {"config": cluster_config, "data": retdata}
|
||||
cluster_details_list.append(cluster_details)
|
||||
|
||||
echo("done.")
|
||||
echo("")
|
||||
|
||||
# Find the lengths of each column
|
||||
name_length = 5
|
||||
description_length = 12
|
||||
health_length = 7
|
||||
primary_node_length = 8
|
||||
pvc_version_length = 8
|
||||
nodes_length = 6
|
||||
vms_length = 4
|
||||
networks_length = 9
|
||||
osds_length = 5
|
||||
pools_length = 6
|
||||
volumes_length = 8
|
||||
snapshots_length = 10
|
||||
|
||||
for cluster_details in cluster_details_list:
|
||||
_name_length = len(cluster_details["config"]["cluster"]) + 1
|
||||
if _name_length > name_length:
|
||||
name_length = _name_length
|
||||
|
||||
_description_length = len(cluster_details["config"]["description"]) + 1
|
||||
if _description_length > description_length:
|
||||
description_length = _description_length
|
||||
|
||||
if cluster_details["data"] is None:
|
||||
continue
|
||||
|
||||
_health_length = (
|
||||
len(
|
||||
str(
|
||||
cluster_details["data"]
|
||||
.get("cluster_health", {})
|
||||
.get("health", "N/A")
|
||||
)
|
||||
+ "%"
|
||||
)
|
||||
+ 1
|
||||
)
|
||||
if _health_length > health_length:
|
||||
health_length = _health_length
|
||||
|
||||
_primary_node_length = len(cluster_details["data"]["primary_node"]) + 1
|
||||
if _primary_node_length > primary_node_length:
|
||||
primary_node_length = _primary_node_length
|
||||
|
||||
_pvc_version_length = (
|
||||
len(cluster_details["data"].get("pvc_version", "< 0.9.62")) + 1
|
||||
)
|
||||
if _pvc_version_length > pvc_version_length:
|
||||
pvc_version_length = _pvc_version_length
|
||||
|
||||
_nodes_length = len(str(cluster_details["data"]["nodes"]["total"])) + 1
|
||||
if _nodes_length > nodes_length:
|
||||
nodes_length = _nodes_length
|
||||
|
||||
_vms_length = len(str(cluster_details["data"]["vms"]["total"])) + 1
|
||||
if _vms_length > vms_length:
|
||||
vms_length = _vms_length
|
||||
|
||||
_networks_length = len(str(cluster_details["data"]["networks"])) + 1
|
||||
if _networks_length > networks_length:
|
||||
networks_length = _networks_length
|
||||
|
||||
_osds_length = len(str(cluster_details["data"]["osds"]["total"])) + 1
|
||||
if _osds_length > osds_length:
|
||||
osds_length = _osds_length
|
||||
|
||||
_pools_length = len(str(cluster_details["data"]["pools"])) + 1
|
||||
if _pools_length > pools_length:
|
||||
pools_length = _pools_length
|
||||
|
||||
_volumes_length = len(str(cluster_details["data"]["volumes"])) + 1
|
||||
if _volumes_length > volumes_length:
|
||||
volumes_length = _volumes_length
|
||||
|
||||
_snapshots_length = len(str(cluster_details["data"]["snapshots"])) + 1
|
||||
if _snapshots_length > snapshots_length:
|
||||
snapshots_length = _snapshots_length
|
||||
|
||||
# Display the data nicely
|
||||
echo(
|
||||
"{bold}{name: <{name_length}} {description: <{description_length}} {health: <{health_length}} {primary_node: <{primary_node_length}} {pvc_version: <{pvc_version_length}} {nodes: <{nodes_length}} {vms: <{vms_length}} {networks: <{networks_length}} {osds: <{osds_length}} {pools: <{pools_length}} {volumes: <{volumes_length}} {snapshots: <{snapshots_length}}{end_bold}".format(
|
||||
bold=ansiprint.bold(),
|
||||
end_bold=ansiprint.end(),
|
||||
name="Name",
|
||||
name_length=name_length,
|
||||
description="Description",
|
||||
description_length=description_length,
|
||||
health="Health",
|
||||
health_length=health_length,
|
||||
primary_node="Primary",
|
||||
primary_node_length=primary_node_length,
|
||||
pvc_version="Version",
|
||||
pvc_version_length=pvc_version_length,
|
||||
nodes="Nodes",
|
||||
nodes_length=nodes_length,
|
||||
vms="VMs",
|
||||
vms_length=vms_length,
|
||||
networks="Networks",
|
||||
networks_length=networks_length,
|
||||
osds="OSDs",
|
||||
osds_length=osds_length,
|
||||
pools="Pools",
|
||||
pools_length=pools_length,
|
||||
volumes="Volumes",
|
||||
volumes_length=volumes_length,
|
||||
snapshots="Snapshots",
|
||||
snapshots_length=snapshots_length,
|
||||
)
|
||||
)
|
||||
|
||||
for cluster_details in cluster_details_list:
|
||||
if cluster_details["data"] is None:
|
||||
health_colour = ansiprint.blue()
|
||||
name = cluster_details["config"]["cluster"]
|
||||
description = cluster_details["config"]["description"]
|
||||
health = "N/A"
|
||||
primary_node = "N/A"
|
||||
pvc_version = "N/A"
|
||||
nodes = "N/A"
|
||||
vms = "N/A"
|
||||
networks = "N/A"
|
||||
osds = "N/A"
|
||||
pools = "N/A"
|
||||
volumes = "N/A"
|
||||
snapshots = "N/A"
|
||||
else:
|
||||
if (
|
||||
cluster_details["data"].get("maintenance") == "true"
|
||||
or cluster_details["data"]
|
||||
.get("cluster_health", {})
|
||||
.get("health", "N/A")
|
||||
== "N/A"
|
||||
):
|
||||
health_colour = ansiprint.blue()
|
||||
elif (
|
||||
cluster_details["data"].get("cluster_health", {}).get("health", 100)
|
||||
> 90
|
||||
):
|
||||
health_colour = ansiprint.green()
|
||||
elif (
|
||||
cluster_details["data"].get("cluster_health", {}).get("health", 100)
|
||||
> 50
|
||||
):
|
||||
health_colour = ansiprint.yellow()
|
||||
else:
|
||||
health_colour = ansiprint.red()
|
||||
|
||||
name = cluster_details["config"]["cluster"]
|
||||
description = cluster_details["config"]["description"]
|
||||
health = str(
|
||||
cluster_details["data"].get("cluster_health", {}).get("health", "N/A")
|
||||
)
|
||||
if health != "N/A":
|
||||
health += "%"
|
||||
primary_node = cluster_details["data"]["primary_node"]
|
||||
pvc_version = cluster_details["data"].get("pvc_version", "< 0.9.62")
|
||||
nodes = str(cluster_details["data"]["nodes"]["total"])
|
||||
vms = str(cluster_details["data"]["vms"]["total"])
|
||||
networks = str(cluster_details["data"]["networks"])
|
||||
osds = str(cluster_details["data"]["osds"]["total"])
|
||||
pools = str(cluster_details["data"]["pools"])
|
||||
volumes = str(cluster_details["data"]["volumes"])
|
||||
snapshots = str(cluster_details["data"]["snapshots"])
|
||||
|
||||
echo(
|
||||
"{name: <{name_length}} {description: <{description_length}} {health_colour}{health: <{health_length}}{end_colour} {primary_node: <{primary_node_length}} {pvc_version: <{pvc_version_length}} {nodes: <{nodes_length}} {vms: <{vms_length}} {networks: <{networks_length}} {osds: <{osds_length}} {pools: <{pools_length}} {volumes: <{volumes_length}} {snapshots: <{snapshots_length}}".format(
|
||||
health_colour=health_colour,
|
||||
end_colour=ansiprint.end(),
|
||||
name=name,
|
||||
name_length=name_length,
|
||||
description=description,
|
||||
description_length=description_length,
|
||||
health=health,
|
||||
health_length=health_length,
|
||||
primary_node=primary_node,
|
||||
primary_node_length=primary_node_length,
|
||||
pvc_version=pvc_version,
|
||||
pvc_version_length=pvc_version_length,
|
||||
nodes=nodes,
|
||||
nodes_length=nodes_length,
|
||||
vms=vms,
|
||||
vms_length=vms_length,
|
||||
networks=networks,
|
||||
networks_length=networks_length,
|
||||
osds=osds,
|
||||
osds_length=osds_length,
|
||||
pools=pools,
|
||||
pools_length=pools_length,
|
||||
volumes=volumes,
|
||||
volumes_length=volumes_length,
|
||||
snapshots=snapshots,
|
||||
snapshots_length=snapshots_length,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# Validate that the cluster is set for a given command
|
||||
def cluster_req(function):
|
||||
@wraps(function)
|
||||
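The cluster_detail command above sizes every column to its widest value before printing the header and rows. A minimal sketch of that width-calculation pattern, using hypothetical sample rows rather than real cluster data:

rows = [
    {"name": "cluster1", "health": "100%"},
    {"name": "long-cluster-name", "health": "N/A"},
]
# Start each width at the header length, then grow it to fit the widest value
widths = {"name": len("Name") + 1, "health": len("Health") + 1}
for row in rows:
    for key, value in row.items():
        widths[key] = max(widths[key], len(str(value)) + 1)
line = "{name: <{wname}} {health: <{whealth}}"
print(line.format(name="Name", wname=widths["name"], health="Health", whealth=widths["health"]))
for row in rows:
    print(line.format(name=row["name"], wname=widths["name"], health=row["health"], whealth=widths["health"]))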
@@ -452,6 +674,24 @@ def cluster_req(function):
|
||||
'No cluster specified and no local pvcapid.yaml configuration found. Use "pvc cluster" to add a cluster API to connect to.'
|
||||
)
|
||||
exit(1)
|
||||
|
||||
if not config["quiet"]:
|
||||
if config["api_scheme"] == "https" and not config["verify_ssl"]:
|
||||
ssl_unverified_msg = " (unverified)"
|
||||
else:
|
||||
ssl_unverified_msg = ""
|
||||
echo(
|
||||
'Using cluster "{}" - Host: "{}" Scheme: "{}{}" Prefix: "{}"'.format(
|
||||
config["cluster"],
|
||||
config["api_host"],
|
||||
config["api_scheme"],
|
||||
ssl_unverified_msg,
|
||||
config["api_prefix"],
|
||||
),
|
||||
err=True,
|
||||
)
|
||||
echo("", err=True)
|
||||
|
||||
return function(*args, **kwargs)
|
||||
|
||||
return validate_cluster
|
||||
@@ -697,15 +937,29 @@ def node_log(node, lines, follow):
|
||||
default=False,
|
||||
help="Display more detailed information.",
|
||||
)
|
||||
@click.option(
|
||||
"-f",
|
||||
"--format",
|
||||
"oformat",
|
||||
default="plain",
|
||||
show_default=True,
|
||||
type=click.Choice(["plain", "json", "json-pretty"]),
|
||||
help="Output format of node status information.",
|
||||
)
|
||||
@cluster_req
|
||||
def node_info(node, long_output):
|
||||
def node_info(node, long_output, oformat):
|
||||
"""
|
||||
Show information about node NODE. If unspecified, defaults to this host.
|
||||
"""
|
||||
|
||||
retcode, retdata = pvc_node.node_info(config, node)
|
||||
if retcode:
|
||||
retdata = pvc_node.format_info(retdata, long_output)
|
||||
if oformat == "json":
|
||||
retdata = json.dumps(retdata)
|
||||
elif oformat == "json-pretty":
|
||||
retdata = json.dumps(retdata, indent=4)
|
||||
else:
|
||||
retdata = pvc_node.format_info(retdata, long_output)
|
||||
cleanup(retcode, retdata)
|
||||
|
||||
|
||||
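The new --format option added above returns node data either as the human-readable text (plain, via format_info) or as the raw API dictionary serialized with json.dumps (json and json-pretty). A tiny illustration of the two JSON variants with hypothetical payload data:

import json

retdata = {"name": "hv1", "daemon_state": "run"}  # hypothetical node_info payload
print(json.dumps(retdata))             # json: single-line output for scripting
print(json.dumps(retdata, indent=4))   # json-pretty: indented for reading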
@@ -807,7 +1061,7 @@ def cli_vm():
|
||||
"node_selector",
|
||||
default="none",
|
||||
show_default=True,
|
||||
type=click.Choice(["mem", "memfree", "load", "vcpus", "vms", "none"]),
|
||||
type=click.Choice(["mem", "memprov", "load", "vcpus", "vms", "none"]),
|
||||
help='Method to determine optimal target node during autoselect; "none" will use the default for the cluster.',
|
||||
)
|
||||
@click.option(
|
||||
@@ -859,15 +1113,15 @@ def vm_define(
|
||||
Define a new virtual machine from Libvirt XML configuration file VMCONFIG.
|
||||
|
||||
The target node selector ("--node-selector"/"-s") can be "none" to use the cluster default, or one of the following values:
|
||||
* "mem": choose the node with the least provisioned VM memory
|
||||
* "memfree": choose the node with the most (real) free memory
|
||||
* "mem": choose the node with the most (real) free memory
|
||||
* "memprov": choose the node with the least provisioned VM memory
|
||||
* "vcpus": choose the node with the least allocated VM vCPUs
|
||||
* "load": choose the node with the lowest current load average
|
||||
* "vms": choose the node with the least number of provisioned VMs
|
||||
|
||||
For most clusters, "mem" should be sufficient, but others may be used based on the cluster workload and available resources. The following caveats should be considered:
|
||||
* "mem" looks at the provisioned memory, not the allocated memory; thus, stopped or disabled VMs are counted towards a node's memory for this selector, even though their memory is not actively in use.
|
||||
* "memfree" looks at the free memory of the node in general, ignoring the amount provisioned to VMs; if any VM's internal memory usage changes, this value would be affected. This might be preferable to "mem" on clusters with very high memory utilization versus total capacity or if many VMs are stopped/disabled.
|
||||
* "mem" looks at the free memory of the node in general, ignoring the amount provisioned to VMs; if any VM's internal memory usage changes, this value would be affected.
|
||||
* "memprov" looks at the provisioned memory, not the allocated memory; thus, stopped or disabled VMs are counted towards a node's memory for this selector, even though their memory is not actively in use.
|
||||
* "load" looks at the system load of the node in general, ignoring load in any particular VMs; if any VM's CPU usage changes, this value would be affected. This might be preferable on clusters with some very CPU intensive VMs.
|
||||
"""
|
||||
|
||||
@@ -914,7 +1168,7 @@ def vm_define(
|
||||
"node_selector",
|
||||
default=None,
|
||||
show_default=False,
|
||||
type=click.Choice(["mem", "memfree", "load", "vcpus", "vms", "none"]),
|
||||
type=click.Choice(["mem", "memprov", "load", "vcpus", "vms", "none"]),
|
||||
help='Method to determine optimal target node during autoselect; "none" will use the default for the cluster.',
|
||||
)
|
||||
@click.option(
|
||||
@@ -4134,7 +4388,7 @@ def provisioner_template_system_list(limit):
|
||||
"--node-selector",
|
||||
"node_selector",
|
||||
type=click.Choice(
|
||||
["mem", "memfree", "vcpus", "vms", "load", "none"], case_sensitive=False
|
||||
["mem", "memprov", "vcpus", "vms", "load", "none"], case_sensitive=False
|
||||
),
|
||||
default="none",
|
||||
help='Method to determine optimal target node during autoselect; "none" will use the default for the cluster.',
|
||||
@@ -4230,7 +4484,7 @@ def provisioner_template_system_add(
|
||||
"--node-selector",
|
||||
"node_selector",
|
||||
type=click.Choice(
|
||||
["mem", "memfree", "vcpus", "vms", "load", "none"], case_sensitive=False
|
||||
["mem", "memprov", "vcpus", "vms", "load", "none"], case_sensitive=False
|
||||
),
|
||||
help='Method to determine optimal target node during autoselect; "none" will use the default for the cluster.',
|
||||
)
|
||||
@@ -5882,23 +6136,7 @@ def cli(_cluster, _debug, _quiet, _unsafe, _colour):
|
||||
config["debug"] = _debug
|
||||
config["unsafe"] = _unsafe
|
||||
config["colour"] = _colour
|
||||
|
||||
if not _quiet:
|
||||
if config["api_scheme"] == "https" and not config["verify_ssl"]:
|
||||
ssl_unverified_msg = " (unverified)"
|
||||
else:
|
||||
ssl_unverified_msg = ""
|
||||
echo(
|
||||
'Using cluster "{}" - Host: "{}" Scheme: "{}{}" Prefix: "{}"'.format(
|
||||
config["cluster"],
|
||||
config["api_host"],
|
||||
config["api_scheme"],
|
||||
ssl_unverified_msg,
|
||||
config["api_prefix"],
|
||||
),
|
||||
err=True,
|
||||
)
|
||||
echo("", err=True)
|
||||
config["quiet"] = _quiet
|
||||
|
||||
audit()
|
||||
|
||||
@@ -5909,6 +6147,7 @@ def cli(_cluster, _debug, _quiet, _unsafe, _colour):
|
||||
cli_cluster.add_command(cluster_add)
|
||||
cli_cluster.add_command(cluster_remove)
|
||||
cli_cluster.add_command(cluster_list)
|
||||
cli_cluster.add_command(cluster_detail)
|
||||
|
||||
cli_node.add_command(node_secondary)
|
||||
cli_node.add_command(node_primary)
|
20
client-cli-old/setup.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from setuptools import setup
|
||||
|
||||
setup(
|
||||
name="pvc",
|
||||
version="0.9.63",
|
||||
packages=["pvc", "pvc.lib"],
|
||||
install_requires=[
|
||||
"Click",
|
||||
"PyYAML",
|
||||
"lxml",
|
||||
"colorama",
|
||||
"requests",
|
||||
"requests-toolbelt",
|
||||
],
|
||||
entry_points={
|
||||
"console_scripts": [
|
||||
"pvc = pvc.pvc:cli",
|
||||
],
|
||||
},
|
||||
)
|
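The console_scripts entry above maps the pvc command to the cli callable in the old client's pvc/pvc.py module. The wrapper script that pip generates for such an entry point behaves roughly like this (a sketch, not the literal generated file):

import sys

from pvc.pvc import cli

if __name__ == "__main__":
    sys.exit(cli())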
33
client-cli/pvc.py
Executable file
@@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# pvc.py - PVC client command-line interface (stub testing interface)
|
||||
# Part of the Parallel Virtual Cluster (PVC) system
|
||||
#
|
||||
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, version 3.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
from pvc.cli.cli import cli
|
||||
|
||||
|
||||
#
|
||||
# Main entry point
|
||||
#
|
||||
def main():
|
||||
return cli(obj={})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
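The obj={} argument in the stub above seeds Click's context object, which the command group and its subcommands share. A minimal sketch of that pattern with hypothetical commands (the real command tree lives in pvc/cli/cli.py, which is not expanded in this diff):

import click

@click.group()
@click.pass_context
def cli(ctx):
    # ctx.obj is the dict passed in via cli(obj={}); stash shared state here
    ctx.obj["connection"] = "local"

@cli.command()
@click.pass_context
def status(ctx):
    click.echo(f"connection: {ctx.obj['connection']}")

if __name__ == "__main__":
    cli(obj={})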
5808
client-cli/pvc/cli/cli.py
Normal file
734
client-cli/pvc/cli/formatters.py
Normal file
@@ -0,0 +1,734 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# formatters.py - PVC Click CLI output formatters library
|
||||
# Part of the Parallel Virtual Cluster (PVC) system
|
||||
#
|
||||
# Copyright (C) 2018-2023 Joshua M. Boniface <joshua@boniface.me>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, version 3.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
from pvc.lib.node import format_info as node_format_info
|
||||
from pvc.lib.node import format_list as node_format_list
|
||||
from pvc.lib.vm import format_vm_tags as vm_format_tags
|
||||
from pvc.lib.vm import format_vm_vcpus as vm_format_vcpus
|
||||
from pvc.lib.vm import format_vm_memory as vm_format_memory
|
||||
from pvc.lib.vm import format_vm_networks as vm_format_networks
|
||||
from pvc.lib.vm import format_vm_volumes as vm_format_volumes
|
||||
from pvc.lib.vm import format_info as vm_format_info
|
||||
from pvc.lib.vm import format_list as vm_format_list
|
||||
from pvc.lib.network import format_info as network_format_info
|
||||
from pvc.lib.network import format_list as network_format_list
|
||||
from pvc.lib.network import format_list_dhcp as network_format_dhcp_list
|
||||
from pvc.lib.network import format_list_acl as network_format_acl_list
|
||||
from pvc.lib.network import format_list_sriov_pf as network_format_sriov_pf_list
|
||||
from pvc.lib.network import format_info_sriov_vf as network_format_sriov_vf_info
|
||||
from pvc.lib.network import format_list_sriov_vf as network_format_sriov_vf_list
|
||||
from pvc.lib.storage import format_raw_output as storage_format_raw
|
||||
from pvc.lib.storage import format_info_benchmark as storage_format_benchmark_info
|
||||
from pvc.lib.storage import format_list_benchmark as storage_format_benchmark_list
|
||||
from pvc.lib.storage import format_list_osd as storage_format_osd_list
|
||||
from pvc.lib.storage import format_list_pool as storage_format_pool_list
|
||||
from pvc.lib.storage import format_list_volume as storage_format_volume_list
|
||||
from pvc.lib.storage import format_list_snapshot as storage_format_snapshot_list
|
||||
from pvc.lib.provisioner import format_list_template as provisioner_format_template_list
|
||||
from pvc.lib.provisioner import format_list_userdata as provisioner_format_userdata_list
|
||||
from pvc.lib.provisioner import format_list_script as provisioner_format_script_list
|
||||
from pvc.lib.provisioner import format_list_ova as provisioner_format_ova_list
|
||||
from pvc.lib.provisioner import format_list_profile as provisioner_format_profile_list
|
||||
from pvc.lib.provisioner import format_list_task as provisioner_format_task_status
|
||||
|
||||
|
||||
# Define colour values for use in formatters
|
||||
ansii = {
|
||||
"red": "\033[91m",
|
||||
"blue": "\033[94m",
|
||||
"cyan": "\033[96m",
|
||||
"green": "\033[92m",
|
||||
"yellow": "\033[93m",
|
||||
"purple": "\033[95m",
|
||||
"bold": "\033[1m",
|
||||
"end": "\033[0m",
|
||||
}
|
||||
|
||||
|
||||
def cli_cluster_status_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the full output of cli_cluster_status
|
||||
"""
|
||||
|
||||
# Normalize data to local variables
|
||||
health = data.get("cluster_health", {}).get("health", -1)
|
||||
messages = data.get("cluster_health", {}).get("messages", None)
|
||||
maintenance = data.get("maintenance", "N/A")
|
||||
primary_node = data.get("primary_node", "N/A")
|
||||
pvc_version = data.get("pvc_version", "N/A")
|
||||
upstream_ip = data.get("upstream_ip", "N/A")
|
||||
total_nodes = data.get("nodes", {}).get("total", 0)
|
||||
total_vms = data.get("vms", {}).get("total", 0)
|
||||
total_networks = data.get("networks", 0)
|
||||
total_osds = data.get("osds", {}).get("total", 0)
|
||||
total_pools = data.get("pools", 0)
|
||||
total_volumes = data.get("volumes", 0)
|
||||
total_snapshots = data.get("snapshots", 0)
|
||||
|
||||
if maintenance == "true" or health == -1:
|
||||
health_colour = ansii["blue"]
|
||||
elif health > 90:
|
||||
health_colour = ansii["green"]
|
||||
elif health > 50:
|
||||
health_colour = ansii["yellow"]
|
||||
else:
|
||||
health_colour = ansii["red"]
|
||||
|
||||
output = list()
|
||||
|
||||
output.append(f"{ansii['bold']}PVC cluster status:{ansii['end']}")
|
||||
output.append("")
|
||||
|
||||
if health != "-1":
|
||||
health = f"{health}%"
|
||||
else:
|
||||
health = "N/A"
|
||||
|
||||
if maintenance == "true":
|
||||
health = f"{health} (maintenance on)"
|
||||
|
||||
output.append(
|
||||
f"{ansii['purple']}Cluster health:{ansii['end']} {health_colour}{health}{ansii['end']}"
|
||||
)
|
||||
|
||||
if messages is not None and len(messages) > 0:
|
||||
messages = "\n ".join(sorted(messages))
|
||||
output.append(f"{ansii['purple']}Health messages:{ansii['end']} {messages}")
|
||||
|
||||
output.append("")
|
||||
|
||||
output.append(f"{ansii['purple']}Primary node:{ansii['end']} {primary_node}")
|
||||
output.append(f"{ansii['purple']}PVC version:{ansii['end']} {pvc_version}")
|
||||
output.append(f"{ansii['purple']}Upstream IP:{ansii['end']} {upstream_ip}")
|
||||
output.append("")
|
||||
|
||||
node_states = ["run,ready"]
|
||||
node_states.extend(
|
||||
[
|
||||
state
|
||||
for state in data.get("nodes", {}).keys()
|
||||
if state not in ["total", "run,ready"]
|
||||
]
|
||||
)
|
||||
|
||||
nodes_strings = list()
|
||||
for state in node_states:
|
||||
if state in ["run,ready"]:
|
||||
state_colour = ansii["green"]
|
||||
elif state in ["run,flush", "run,unflush", "run,flushed"]:
|
||||
state_colour = ansii["blue"]
|
||||
elif "dead" in state or "fenced" in state or "stop" in state:
|
||||
state_colour = ansii["red"]
|
||||
else:
|
||||
state_colour = ansii["yellow"]
|
||||
|
||||
nodes_strings.append(
|
||||
f"{data.get('nodes', {}).get(state)}/{total_nodes} {state_colour}{state}{ansii['end']}"
|
||||
)
|
||||
|
||||
nodes_string = ", ".join(nodes_strings)
|
||||
|
||||
output.append(f"{ansii['purple']}Nodes:{ansii['end']} {nodes_string}")
|
||||
|
||||
vm_states = ["start", "disable"]
|
||||
vm_states.extend(
|
||||
[
|
||||
state
|
||||
for state in data.get("vms", {}).keys()
|
||||
if state not in ["total", "start", "disable"]
|
||||
]
|
||||
)
|
||||
|
||||
vms_strings = list()
|
||||
for state in vm_states:
|
||||
if data.get("vms", {}).get(state) is None:
|
||||
continue
|
||||
if state in ["start"]:
|
||||
state_colour = ansii["green"]
|
||||
elif state in ["migrate", "disable", "provision"]:
|
||||
state_colour = ansii["blue"]
|
||||
elif state in ["stop", "fail"]:
|
||||
state_colour = ansii["red"]
|
||||
else:
|
||||
state_colour = ansii["yellow"]
|
||||
|
||||
vms_strings.append(
|
||||
f"{data.get('vms', {}).get(state)}/{total_vms} {state_colour}{state}{ansii['end']}"
|
||||
)
|
||||
|
||||
vms_string = ", ".join(vms_strings)
|
||||
|
||||
output.append(f"{ansii['purple']}VMs:{ansii['end']} {vms_string}")
|
||||
|
||||
osd_states = ["up,in"]
|
||||
osd_states.extend(
|
||||
[
|
||||
state
|
||||
for state in data.get("osds", {}).keys()
|
||||
if state not in ["total", "up,in"]
|
||||
]
|
||||
)
|
||||
|
||||
osds_strings = list()
|
||||
for state in osd_states:
|
||||
if state in ["up,in"]:
|
||||
state_colour = ansii["green"]
|
||||
elif state in ["down,out"]:
|
||||
state_colour = ansii["red"]
|
||||
else:
|
||||
state_colour = ansii["yellow"]
|
||||
|
||||
osds_strings.append(
|
||||
f"{data.get('osds', {}).get(state)}/{total_osds} {state_colour}{state}{ansii['end']}"
|
||||
)
|
||||
|
||||
osds_string = " ".join(osds_strings)
|
||||
|
||||
output.append(f"{ansii['purple']}OSDs:{ansii['end']} {osds_string}")
|
||||
|
||||
output.append(f"{ansii['purple']}Pools:{ansii['end']} {total_pools}")
|
||||
|
||||
output.append(f"{ansii['purple']}Volumes:{ansii['end']} {total_volumes}")
|
||||
|
||||
output.append(f"{ansii['purple']}Snapshots:{ansii['end']} {total_snapshots}")
|
||||
|
||||
output.append(f"{ansii['purple']}Networks:{ansii['end']} {total_networks}")
|
||||
|
||||
output.append("")
|
||||
|
||||
return "\n".join(output)
|
||||
|
||||
|
||||
def cli_cluster_status_format_short(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the health-only output of cli_cluster_status
|
||||
"""
|
||||
|
||||
# Normalize data to local variables
|
||||
health = data.get("cluster_health", {}).get("health", -1)
|
||||
messages = data.get("cluster_health", {}).get("messages", None)
|
||||
maintenance = data.get("maintenance", "N/A")
|
||||
|
||||
if maintenance == "true" or health == -1:
|
||||
health_colour = ansii["blue"]
|
||||
elif health > 90:
|
||||
health_colour = ansii["green"]
|
||||
elif health > 50:
|
||||
health_colour = ansii["yellow"]
|
||||
else:
|
||||
health_colour = ansii["red"]
|
||||
|
||||
output = list()
|
||||
|
||||
output.append(f"{ansii['bold']}PVC cluster status:{ansii['end']}")
|
||||
output.append("")
|
||||
|
||||
if health != "-1":
|
||||
health = f"{health}%"
|
||||
else:
|
||||
health = "N/A"
|
||||
|
||||
if maintenance == "true":
|
||||
health = f"{health} (maintenance on)"
|
||||
|
||||
output.append(
|
||||
f"{ansii['purple']}Cluster health:{ansii['end']} {health_colour}{health}{ansii['end']}"
|
||||
)
|
||||
|
||||
if messages is not None and len(messages) > 0:
|
||||
messages = "\n ".join(sorted(messages))
|
||||
output.append(f"{ansii['purple']}Health messages:{ansii['end']} {messages}")
|
||||
|
||||
output.append("")
|
||||
|
||||
return "\n".join(output)
|
||||
|
||||
|
||||
def cli_connection_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_connection_list
|
||||
"""
|
||||
|
||||
# Set the fields data
|
||||
fields = {
|
||||
"name": {"header": "Name", "length": len("Name") + 1},
|
||||
"description": {"header": "Description", "length": len("Description") + 1},
|
||||
"address": {"header": "Address", "length": len("Address") + 1},
|
||||
"port": {"header": "Port", "length": len("Port") + 1},
|
||||
"scheme": {"header": "Scheme", "length": len("Scheme") + 1},
|
||||
"api_key": {"header": "API Key", "length": len("API Key") + 1},
|
||||
}
|
||||
|
||||
# Parse each connection and adjust field lengths
|
||||
for connection in data:
|
||||
for field, length in [(f, fields[f]["length"]) for f in fields]:
|
||||
_length = len(str(connection[field]))
|
||||
if _length > length:
|
||||
length = len(str(connection[field])) + 1
|
||||
|
||||
fields[field]["length"] = length
|
||||
|
||||
# Create the output object and define the line format
|
||||
output = list()
|
||||
line = "{bold}{name: <{lname}} {desc: <{ldesc}} {addr: <{laddr}} {port: <{lport}} {schm: <{lschm}} {akey: <{lakey}}{end}"
|
||||
|
||||
# Add the header line
|
||||
output.append(
|
||||
line.format(
|
||||
bold=ansii["bold"],
|
||||
end=ansii["end"],
|
||||
name=fields["name"]["header"],
|
||||
lname=fields["name"]["length"],
|
||||
desc=fields["description"]["header"],
|
||||
ldesc=fields["description"]["length"],
|
||||
addr=fields["address"]["header"],
|
||||
laddr=fields["address"]["length"],
|
||||
port=fields["port"]["header"],
|
||||
lport=fields["port"]["length"],
|
||||
schm=fields["scheme"]["header"],
|
||||
lschm=fields["scheme"]["length"],
|
||||
akey=fields["api_key"]["header"],
|
||||
lakey=fields["api_key"]["length"],
|
||||
)
|
||||
)
|
||||
|
||||
# Add a line per connection
|
||||
for connection in data:
|
||||
output.append(
|
||||
line.format(
|
||||
bold="",
|
||||
end="",
|
||||
name=connection["name"],
|
||||
lname=fields["name"]["length"],
|
||||
desc=connection["description"],
|
||||
ldesc=fields["description"]["length"],
|
||||
addr=connection["address"],
|
||||
laddr=fields["address"]["length"],
|
||||
port=connection["port"],
|
||||
lport=fields["port"]["length"],
|
||||
schm=connection["scheme"],
|
||||
lschm=fields["scheme"]["length"],
|
||||
akey=connection["api_key"],
|
||||
lakey=fields["api_key"]["length"],
|
||||
)
|
||||
)
|
||||
|
||||
return "\n".join(output)
|
||||
|
||||
|
||||
def cli_connection_detail_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_connection_detail
|
||||
"""
|
||||
|
||||
# Set the fields data
|
||||
fields = {
|
||||
"name": {"header": "Name", "length": len("Name") + 1},
|
||||
"description": {"header": "Description", "length": len("Description") + 1},
|
||||
"health": {"header": "Health", "length": len("Health") + 1},
|
||||
"primary_node": {"header": "Primary", "length": len("Primary") + 1},
|
||||
"pvc_version": {"header": "Version", "length": len("Version") + 1},
|
||||
"nodes": {"header": "Nodes", "length": len("Nodes") + 1},
|
||||
"vms": {"header": "VMs", "length": len("VMs") + 1},
|
||||
"networks": {"header": "Networks", "length": len("Networks") + 1},
|
||||
"osds": {"header": "OSDs", "length": len("OSDs") + 1},
|
||||
"pools": {"header": "Pools", "length": len("Pools") + 1},
|
||||
"volumes": {"header": "Volumes", "length": len("Volumes") + 1},
|
||||
"snapshots": {"header": "Snapshots", "length": len("Snapshots") + 1},
|
||||
}
|
||||
|
||||
# Parse each connection and adjust field lengths
|
||||
for connection in data:
|
||||
for field, length in [(f, fields[f]["length"]) for f in fields]:
|
||||
_length = len(str(connection[field]))
|
||||
if _length > length:
|
||||
length = len(str(connection[field])) + 1
|
||||
|
||||
fields[field]["length"] = length
|
||||
|
||||
# Create the output object and define the line format
|
||||
output = list()
|
||||
line = "{bold}{name: <{lname}} {desc: <{ldesc}} {chlth}{hlth: <{lhlth}}{endc} {prin: <{lprin}} {vers: <{lvers}} {nods: <{lnods}} {vms: <{lvms}} {nets: <{lnets}} {osds: <{losds}} {pols: <{lpols}} {vols: <{lvols}} {snts: <{lsnts}}{end}"
|
||||
|
||||
# Add the header line
|
||||
output.append(
|
||||
line.format(
|
||||
bold=ansii["bold"],
|
||||
end=ansii["end"],
|
||||
chlth="",
|
||||
endc="",
|
||||
name=fields["name"]["header"],
|
||||
lname=fields["name"]["length"],
|
||||
desc=fields["description"]["header"],
|
||||
ldesc=fields["description"]["length"],
|
||||
hlth=fields["health"]["header"],
|
||||
lhlth=fields["health"]["length"],
|
||||
prin=fields["primary_node"]["header"],
|
||||
lprin=fields["primary_node"]["length"],
|
||||
vers=fields["pvc_version"]["header"],
|
||||
lvers=fields["pvc_version"]["length"],
|
||||
nods=fields["nodes"]["header"],
|
||||
lnods=fields["nodes"]["length"],
|
||||
vms=fields["vms"]["header"],
|
||||
lvms=fields["vms"]["length"],
|
||||
nets=fields["networks"]["header"],
|
||||
lnets=fields["networks"]["length"],
|
||||
osds=fields["osds"]["header"],
|
||||
losds=fields["osds"]["length"],
|
||||
pols=fields["pools"]["header"],
|
||||
lpols=fields["pools"]["length"],
|
||||
vols=fields["volumes"]["header"],
|
||||
lvols=fields["volumes"]["length"],
|
||||
snts=fields["snapshots"]["header"],
|
||||
lsnts=fields["snapshots"]["length"],
|
||||
)
|
||||
)
|
||||
|
||||
# Add a line per connection
|
||||
for connection in data:
|
||||
if connection["health"] == "N/A":
|
||||
health_value = "N/A"
|
||||
health_colour = ansii["purple"]
|
||||
else:
|
||||
health_value = f"{connection['health']}%"
|
||||
if connection["maintenance"] == "true":
|
||||
health_colour = ansii["blue"]
|
||||
elif connection["health"] > 90:
|
||||
health_colour = ansii["green"]
|
||||
elif connection["health"] > 50:
|
||||
health_colour = ansii["yellow"]
|
||||
else:
|
||||
health_colour = ansii["red"]
|
||||
|
||||
output.append(
|
||||
line.format(
|
||||
bold="",
|
||||
end="",
|
||||
chlth=health_colour,
|
||||
endc=ansii["end"],
|
||||
name=connection["name"],
|
||||
lname=fields["name"]["length"],
|
||||
desc=connection["description"],
|
||||
ldesc=fields["description"]["length"],
|
||||
hlth=health_value,
|
||||
lhlth=fields["health"]["length"],
|
||||
prin=connection["primary_node"],
|
||||
lprin=fields["primary_node"]["length"],
|
||||
vers=connection["pvc_version"],
|
||||
lvers=fields["pvc_version"]["length"],
|
||||
nods=connection["nodes"],
|
||||
lnods=fields["nodes"]["length"],
|
||||
vms=connection["vms"],
|
||||
lvms=fields["vms"]["length"],
|
||||
nets=connection["networks"],
|
||||
lnets=fields["networks"]["length"],
|
||||
osds=connection["osds"],
|
||||
losds=fields["osds"]["length"],
|
||||
pols=connection["pools"],
|
||||
lpols=fields["pools"]["length"],
|
||||
vols=connection["volumes"],
|
||||
lvols=fields["volumes"]["length"],
|
||||
snts=connection["snapshots"],
|
||||
lsnts=fields["snapshots"]["length"],
|
||||
)
|
||||
)
|
||||
|
||||
return "\n".join(output)
|
||||
|
||||
|
||||
def cli_node_info_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the basic output of cli_node_info
|
||||
"""
|
||||
|
||||
return node_format_info(CLI_CONFIG, data, long_output=False)
|
||||
|
||||
|
||||
def cli_node_info_format_long(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the full output of cli_node_info
|
||||
"""
|
||||
|
||||
return node_format_info(CLI_CONFIG, data, long_output=True)
|
||||
|
||||
|
||||
def cli_node_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_node_list
|
||||
"""
|
||||
|
||||
return node_format_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_vm_tag_get_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_vm_tag_get
|
||||
"""
|
||||
|
||||
return vm_format_tags(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_vm_vcpu_get_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_vm_vcpu_get
|
||||
"""
|
||||
|
||||
return vm_format_vcpus(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_vm_memory_get_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_vm_memory_get
|
||||
"""
|
||||
|
||||
return vm_format_memory(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_vm_network_get_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_vm_network_get
|
||||
"""
|
||||
|
||||
return vm_format_networks(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_vm_volume_get_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_vm_volume_get
|
||||
"""
|
||||
|
||||
return vm_format_volumes(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_vm_info_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the basic output of cli_vm_info
|
||||
"""
|
||||
|
||||
return vm_format_info(CLI_CONFIG, data, long_output=False)
|
||||
|
||||
|
||||
def cli_vm_info_format_long(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the full output of cli_vm_info
|
||||
"""
|
||||
|
||||
return vm_format_info(CLI_CONFIG, data, long_output=True)
|
||||
|
||||
|
||||
def cli_vm_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_vm_list
|
||||
"""
|
||||
|
||||
return vm_format_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_network_info_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the full output of cli_network_info
|
||||
"""
|
||||
|
||||
return network_format_info(CLI_CONFIG, data, long_output=True)
|
||||
|
||||
|
||||
def cli_network_info_format_long(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the full output of cli_network_info
|
||||
"""
|
||||
|
||||
return network_format_info(CLI_CONFIG, data, long_output=True)
|
||||
|
||||
|
||||
def cli_network_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_network_list
|
||||
"""
|
||||
|
||||
return network_format_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_network_dhcp_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_network_dhcp_list
|
||||
"""
|
||||
|
||||
return network_format_dhcp_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_network_acl_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_network_acl_list
|
||||
"""
|
||||
|
||||
return network_format_acl_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_network_sriov_pf_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_network_sriov_pf_list
|
||||
"""
|
||||
|
||||
return network_format_sriov_pf_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_network_sriov_vf_info_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_network_sriov_vf_info
|
||||
"""
|
||||
|
||||
return network_format_sriov_vf_info(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_network_sriov_vf_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_network_sriov_vf_list
|
||||
"""
|
||||
|
||||
return network_format_sriov_vf_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_storage_status_format_raw(CLI_CONFIG, data):
|
||||
"""
|
||||
Direct format the output of cli_storage_status
|
||||
"""
|
||||
|
||||
return storage_format_raw(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_storage_util_format_raw(CLI_CONFIG, data):
|
||||
"""
|
||||
Direct format the output of cli_storage_util
|
||||
"""
|
||||
|
||||
return storage_format_raw(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_storage_benchmark_info_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_storage_benchmark_info
|
||||
"""
|
||||
|
||||
return storage_format_benchmark_info(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_storage_benchmark_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_storage_benchmark_list
|
||||
"""
|
||||
|
||||
return storage_format_benchmark_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_storage_osd_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_storage_osd_list
|
||||
"""
|
||||
|
||||
return storage_format_osd_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_storage_pool_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_storage_pool_list
|
||||
"""
|
||||
|
||||
return storage_format_pool_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_storage_volume_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_storage_volume_list
|
||||
"""
|
||||
|
||||
return storage_format_volume_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_storage_snapshot_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_storage_snapshot_list
|
||||
"""
|
||||
|
||||
return storage_format_snapshot_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_provisioner_template_system_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_provisioner_template_system_list
|
||||
"""
|
||||
|
||||
return provisioner_format_template_list(CLI_CONFIG, data, template_type="system")
|
||||
|
||||
|
||||
def cli_provisioner_template_network_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_provisioner_template_network_list
|
||||
"""
|
||||
|
||||
return provisioner_format_template_list(CLI_CONFIG, data, template_type="network")
|
||||
|
||||
|
||||
def cli_provisioner_template_storage_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_provisioner_template_storage_list
|
||||
"""
|
||||
|
||||
return provisioner_format_template_list(CLI_CONFIG, data, template_type="storage")
|
||||
|
||||
|
||||
def cli_provisioner_userdata_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_provisioner_userdata_list
|
||||
"""
|
||||
|
||||
return provisioner_format_userdata_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_provisioner_script_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_provisioner_script_list
|
||||
"""
|
||||
|
||||
return provisioner_format_script_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_provisioner_ova_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_provisioner_ova_list
|
||||
"""
|
||||
|
||||
return provisioner_format_ova_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_provisioner_profile_list_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_provisioner_profile_list
|
||||
"""
|
||||
|
||||
return provisioner_format_profile_list(CLI_CONFIG, data)
|
||||
|
||||
|
||||
def cli_provisioner_status_format_pretty(CLI_CONFIG, data):
|
||||
"""
|
||||
Pretty format the output of cli_provisioner_status
|
||||
"""
|
||||
|
||||
return provisioner_format_task_status(CLI_CONFIG, data)
|
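The health-to-colour thresholds above are repeated in several of these formatters: maintenance mode or an unknown health maps to blue, above 90 to green, above 50 to yellow, and anything else to red. Distilled into one helper for clarity; this is an editorial sketch rather than a function from the diff, and it reuses the ansii dictionary defined at the top of this module:

def health_to_colour(health, maintenance="false"):
    # health is an integer percentage, or -1 when the cluster did not report one
    if maintenance == "true" or health == -1:
        return ansii["blue"]
    elif health > 90:
        return ansii["green"]
    elif health > 50:
        return ansii["yellow"]
    else:
        return ansii["red"]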
241
client-cli/pvc/cli/helpers.py
Normal file
@@ -0,0 +1,241 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# helpers.py - PVC Click CLI helper function library
|
||||
# Part of the Parallel Virtual Cluster (PVC) system
|
||||
#
|
||||
# Copyright (C) 2018-2023 Joshua M. Boniface <joshua@boniface.me>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, version 3.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
from click import echo as click_echo
|
||||
from click import progressbar
|
||||
from distutils.util import strtobool
|
||||
from json import load as jload
|
||||
from json import dump as jdump
|
||||
from os import chmod, environ, getpid, path
|
||||
from socket import gethostname
|
||||
from sys import argv
|
||||
from syslog import syslog, openlog, closelog, LOG_AUTH
|
||||
from time import sleep
|
||||
from yaml import load as yload
|
||||
from yaml import BaseLoader
|
||||
|
||||
import pvc.lib.provisioner
|
||||
|
||||
|
||||
DEFAULT_STORE_DATA = {"cfgfile": "/etc/pvc/pvcapid.yaml"}
|
||||
DEFAULT_STORE_FILENAME = "pvc.json"
|
||||
DEFAULT_API_PREFIX = "/api/v1"
|
||||
DEFAULT_NODE_HOSTNAME = gethostname().split(".")[0]
|
||||
|
||||
|
||||
def echo(config, message, newline=True, stderr=False):
|
||||
"""
|
||||
Output a message with click.echo respecting our configuration
|
||||
"""
|
||||
|
||||
if config.get("colour", False):
|
||||
colour = True
|
||||
else:
|
||||
colour = None
|
||||
|
||||
if config.get("silent", False):
|
||||
pass
|
||||
elif config.get("quiet", False) and stderr:
|
||||
pass
|
||||
else:
|
||||
click_echo(message=message, color=colour, nl=newline, err=stderr)
|
||||
|
||||
|
||||
def audit():
|
||||
"""
|
||||
Log an audit message to the local syslog AUTH facility
|
||||
"""
|
||||
|
||||
args = argv
|
||||
args[0] = "pvc"
|
||||
pid = getpid()
|
||||
|
||||
openlog(facility=LOG_AUTH, ident=f"{args[0]}[{pid}]")
|
||||
syslog(
|
||||
f"""client audit: command "{' '.join(args)}" by user {environ.get('USER', None)}"""
|
||||
)
|
||||
closelog()
|
||||
|
||||
|
||||
def read_config_from_yaml(cfgfile):
|
||||
"""
|
||||
Read the PVC API configuration from the local API configuration file
|
||||
"""
|
||||
|
||||
try:
|
||||
with open(cfgfile) as fh:
|
||||
api_config = yload(fh, Loader=BaseLoader)["pvc"]["api"]
|
||||
|
||||
host = api_config["listen_address"]
|
||||
port = api_config["listen_port"]
|
||||
scheme = "https" if strtobool(api_config["ssl"]["enabled"]) else "http"
|
||||
api_key = (
|
||||
api_config["authentication"]["tokens"][0]["token"]
|
||||
if strtobool(api_config["authentication"]["enabled"])
|
||||
else None
|
||||
)
|
||||
except KeyError:
|
||||
host = None
|
||||
port = None
|
||||
scheme = None
|
||||
api_key = None
|
||||
|
||||
return cfgfile, host, port, scheme, api_key
|
||||
|
||||
|
||||
def get_config(store_data, connection=None):
|
||||
"""
|
||||
Load CLI configuration from store data
|
||||
"""
|
||||
|
||||
if store_data is None:
|
||||
return {"badcfg": True}
|
||||
|
||||
connection_details = store_data.get(connection, None)
|
||||
|
||||
if not connection_details:
|
||||
connection = "local"
|
||||
connection_details = DEFAULT_STORE_DATA
|
||||
|
||||
if connection_details.get("cfgfile", None) is not None:
|
||||
if path.isfile(connection_details.get("cfgfile", None)):
|
||||
description, host, port, scheme, api_key = read_config_from_yaml(
|
||||
connection_details.get("cfgfile", None)
|
||||
)
|
||||
if None in [description, host, port, scheme]:
|
||||
return {"badcfg": True}
|
||||
else:
|
||||
return {"badcfg": True}
|
||||
# Rewrite a wildcard listener to use localhost instead
|
||||
if host == "0.0.0.0":
|
||||
host = "127.0.0.1"
|
||||
else:
|
||||
# This is a static configuration, get the details directly
|
||||
description = connection_details["description"]
|
||||
host = connection_details["host"]
|
||||
port = connection_details["port"]
|
||||
scheme = connection_details["scheme"]
|
||||
api_key = connection_details["api_key"]
|
||||
|
||||
config = dict()
|
||||
config["debug"] = False
|
||||
config["connection"] = connection
|
||||
config["description"] = description
|
||||
config["api_host"] = f"{host}:{port}"
|
||||
config["api_scheme"] = scheme
|
||||
config["api_key"] = api_key
|
||||
config["api_prefix"] = DEFAULT_API_PREFIX
|
||||
if connection == "local":
|
||||
config["verify_ssl"] = False
|
||||
else:
|
||||
config["verify_ssl"] = bool(
|
||||
strtobool(environ.get("PVC_CLIENT_VERIFY_SSL", "True"))
|
||||
)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def get_store(store_path):
|
||||
"""
|
||||
Load store information from the store path
|
||||
"""
|
||||
|
||||
store_file = f"{store_path}/{DEFAULT_STORE_FILENAME}"
|
||||
|
||||
with open(store_file) as fh:
|
||||
try:
|
||||
store_data = jload(fh)
|
||||
return store_data
|
||||
except Exception:
|
||||
return dict()
|
||||
|
||||
|
||||
def update_store(store_path, store_data):
|
||||
"""
|
||||
Update store information to the store path, creating it (with sensible permissions) if needed
|
||||
"""
|
||||
|
||||
store_file = f"{store_path}/{DEFAULT_STORE_FILENAME}"
|
||||
|
||||
if not path.exists(store_file):
|
||||
with open(store_file, "w") as fh:
|
||||
fh.write("")
|
||||
chmod(store_file, int(environ.get("PVC_CLIENT_DB_PERMS", "600"), 8))
|
||||
|
||||
with open(store_file, "w") as fh:
|
||||
jdump(store_data, fh, sort_keys=True, indent=4)
|
||||
|
||||
|
||||
def wait_for_provisioner(CLI_CONFIG, task_id):
|
||||
"""
|
||||
Wait for a provisioner task to complete
|
||||
"""
|
||||
|
||||
echo(CLI_CONFIG, f"Task ID: {task_id}")
|
||||
echo(CLI_CONFIG, "")
|
||||
|
||||
# Wait for the task to start
|
||||
echo(CLI_CONFIG, "Waiting for task to start...", newline=False)
|
||||
while True:
|
||||
sleep(1)
|
||||
task_status = pvc.lib.provisioner.task_status(
|
||||
CLI_CONFIG, task_id, is_watching=True
|
||||
)
|
||||
if task_status.get("state") != "PENDING":
|
||||
break
|
||||
echo(".", newline=False)
|
||||
echo(CLI_CONFIG, " done.")
|
||||
echo(CLI_CONFIG, "")
|
||||
|
||||
# Start following the task state, updating progress as we go
|
||||
total_task = task_status.get("total")
|
||||
with progressbar(length=total_task, show_eta=False) as bar:
|
||||
last_task = 0
|
||||
maxlen = 0
|
||||
while True:
|
||||
sleep(1)
|
||||
if task_status.get("state") != "RUNNING":
|
||||
break
|
||||
if task_status.get("current") > last_task:
|
||||
current_task = int(task_status.get("current"))
|
||||
bar.update(current_task - last_task)
|
||||
last_task = current_task
|
||||
# The extensive spaces at the end cause this to overwrite longer previous messages
|
||||
curlen = len(str(task_status.get("status")))
|
||||
if curlen > maxlen:
|
||||
maxlen = curlen
|
||||
lendiff = maxlen - curlen
|
||||
overwrite_whitespace = " " * lendiff
|
||||
echo(
|
||||
CLI_CONFIG,
|
||||
" " + task_status.get("status") + overwrite_whitespace,
|
||||
newline=False,
|
||||
)
|
||||
task_status = pvc.lib.provisioner.task_status(
|
||||
CLI_CONFIG, task_id, is_watching=True
|
||||
)
|
||||
if task_status.get("state") == "SUCCESS":
|
||||
bar.update(total_task - last_task)
|
||||
|
||||
echo(CLI_CONFIG, "")
|
||||
retdata = task_status.get("state") + ": " + task_status.get("status")
|
||||
|
||||
return retdata
|
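helpers.py keeps connection details in a small JSON store (pvc.json under the store path) and turns an entry into the api_host/api_scheme/api_key dictionary the rest of the CLI consumes. A hedged round-trip sketch using the functions above; the store path and connection entry are made up for illustration:

from os import makedirs

from pvc.cli.helpers import get_config, get_store, update_store

store_path = "/tmp/pvc-example"           # hypothetical store directory
makedirs(store_path, exist_ok=True)

store_data = {
    "prod": {                             # hypothetical connection entry
        "description": "Production cluster",
        "host": "10.0.0.10",
        "port": 7370,
        "scheme": "http",
        "api_key": None,
    }
}
update_store(store_path, store_data)      # writes /tmp/pvc-example/pvc.json
config = get_config(get_store(store_path), connection="prod")
print(config["api_host"])                 # -> 10.0.0.10:7370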
124
client-cli/pvc/cli/parsers.py
Normal file
@@ -0,0 +1,124 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# parsers.py - PVC Click CLI data parser function library
|
||||
# Part of the Parallel Virtual Cluster (PVC) system
|
||||
#
|
||||
# Copyright (C) 2018-2023 Joshua M. Boniface <joshua@boniface.me>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, version 3.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
from os import path
|
||||
from re import sub
|
||||
|
||||
from pvc.cli.helpers import read_config_from_yaml, get_config
|
||||
|
||||
import pvc.lib.cluster
|
||||
|
||||
|
||||
def cli_connection_list_parser(connections_config, show_keys_flag):
|
||||
"""
|
||||
Parse connections_config into formatable data for cli_connection_list
|
||||
"""
|
||||
|
||||
connections_data = list()
|
||||
|
||||
for connection, details in connections_config.items():
|
||||
if details.get("cfgfile", None) is not None:
|
||||
if path.isfile(details.get("cfgfile")):
|
||||
description, address, port, scheme, api_key = read_config_from_yaml(
|
||||
details.get("cfgfile")
|
||||
)
|
||||
else:
|
||||
continue
|
||||
if not show_keys_flag and api_key is not None:
|
||||
api_key = sub(r"[a-z0-9]", "x", api_key)
|
||||
connections_data.append(
|
||||
{
|
||||
"name": connection,
|
||||
"description": description,
|
||||
"address": address,
|
||||
"port": port,
|
||||
"scheme": scheme,
|
||||
"api_key": api_key,
|
||||
}
|
||||
)
|
||||
else:
|
||||
if not show_keys_flag:
|
||||
details["api_key"] = sub(r"[a-z0-9]", "x", details["api_key"])
|
||||
connections_data.append(
|
||||
{
|
||||
"name": connection,
|
||||
"description": details["description"],
|
||||
"address": details["host"],
|
||||
"port": details["port"],
|
||||
"scheme": details["scheme"],
|
||||
"api_key": details["api_key"],
|
||||
}
|
||||
)
|
||||
|
||||
return connections_data
|
||||
|
||||
|
||||
def cli_connection_detail_parser(connections_config):
|
||||
"""
|
||||
Parse connections_config into formatable data for cli_connection_detail
|
||||
"""
|
||||
connections_data = list()
|
||||
for connection, details in connections_config.items():
|
||||
cluster_config = get_config(connections_config, connection=connection)
|
||||
if cluster_config.get("badcfg", False):
|
||||
continue
|
||||
# Connect to each API and gather cluster status
|
||||
retcode, retdata = pvc.lib.cluster.get_info(cluster_config)
|
||||
if retcode == 0:
|
||||
# Create dummy data of N/A for all fields
|
||||
connections_data.append(
|
||||
{
|
||||
"name": cluster_config["connection"],
|
||||
"description": cluster_config["description"],
|
||||
"health": "N/A",
|
||||
"maintenance": "N/A",
|
||||
"primary_node": "N/A",
|
||||
"pvc_version": "N/A",
|
||||
"nodes": "N/A",
|
||||
"vms": "N/A",
|
||||
"networks": "N/A",
|
||||
"osds": "N/A",
|
||||
"pools": "N/A",
|
||||
"volumes": "N/A",
|
||||
"snapshots": "N/A",
|
||||
}
|
||||
)
|
||||
else:
|
||||
# Normalize data into nice formattable version
|
||||
connections_data.append(
|
||||
{
|
||||
"name": cluster_config["connection"],
|
||||
"description": cluster_config["description"],
|
||||
"health": retdata.get("cluster_health", {}).get("health", "N/A"),
|
||||
"maintenance": retdata.get("maintenance", "N/A"),
|
||||
"primary_node": retdata.get("primary_node", "N/A"),
|
||||
"pvc_version": retdata.get("pvc_version", "N/A"),
|
||||
"nodes": retdata.get("nodes", {}).get("total", "N/A"),
|
||||
"vms": retdata.get("vms", {}).get("total", "N/A"),
|
||||
"networks": retdata.get("networks", "N/A"),
|
||||
"osds": retdata.get("osds", {}).get("total", "N/A"),
|
||||
"pools": retdata.get("pools", "N/A"),
|
||||
"volumes": retdata.get("volumes", "N/A"),
|
||||
"snapshots": retdata.get("snapshots", "N/A"),
|
||||
}
|
||||
)
|
||||
|
||||
return connections_data
|
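cli_connection_list_parser above hides API keys by replacing every lower-case letter and digit with an x unless the show-keys flag is set. A quick illustration of that masking with a made-up key:

from re import sub

api_key = "a1b2c3d4-e5f6-7890-abcd-ef1234567890"  # hypothetical key
print(sub(r"[a-z0-9]", "x", api_key))             # xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx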
64
client-cli/pvc/cli/waiters.py
Normal file
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# waiters.py - PVC Click CLI output waiters library
|
||||
# Part of the Parallel Virtual Cluster (PVC) system
|
||||
#
|
||||
# Copyright (C) 2018-2023 Joshua M. Boniface <joshua@boniface.me>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, version 3.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
from time import sleep, time
|
||||
|
||||
from pvc.cli.helpers import echo
|
||||
|
||||
import pvc.lib.node
|
||||
|
||||
|
||||
def cli_node_waiter(config, node, state_field, state_value):
|
||||
"""
|
||||
Wait for state transitions for cli_node tasks
|
||||
|
||||
{node} is the name of the node
|
||||
{state_field} is the node_info field to check for {state_value}
|
||||
{state_value} is the TRANSITIONAL value that, when no longer set, will terminate waiting
|
||||
"""
|
||||
|
||||
# Sleep for this long between API polls
|
||||
sleep_time = 1
|
||||
|
||||
# Print a dot after this many {sleep_time}s
|
||||
dot_time = 5
|
||||
|
||||
t_start = time()
|
||||
|
||||
echo(config, "Waiting...", newline=False)
|
||||
sleep(sleep_time)
|
||||
|
||||
count = 0
|
||||
while True:
|
||||
count += 1
|
||||
try:
|
||||
_retcode, _retdata = pvc.lib.node.node_info(config, node)
|
||||
if _retdata[state_field] != state_value:
|
||||
break
|
||||
else:
|
||||
raise ValueError
|
||||
except Exception:
|
||||
sleep(sleep_time)
|
||||
if count % dot_time == 0:
|
||||
echo(config, ".", newline=False)
|
||||
|
||||
t_end = time()
|
||||
echo(config, f" done. [{int(t_end - t_start)}s]")
|
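cli_node_waiter above polls node_info once a second until the watched field leaves its transitional value, printing a dot every five polls. A hedged usage sketch: the node name is made up, and the field/value pair is an assumption meant to mirror a node flush (the real call sites live in cli.py, which is not expanded here):

from pvc.cli.helpers import get_config, get_store
from pvc.cli.waiters import cli_node_waiter

config = get_config(get_store("/tmp/pvc-example"), connection="prod")
# Block until node "hv1" is no longer in the transitional "flush" domain state
cli_node_waiter(config, "hv1", "domain_state", "flush")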
0
client-cli/pvc/lib/__init__.py
Normal file
97
client-cli/pvc/lib/ansiprint.py
Normal file
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# ansiprint.py - Printing function for formatted messages
|
||||
# Part of the Parallel Virtual Cluster (PVC) system
|
||||
#
|
||||
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, version 3.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
import datetime
|
||||
|
||||
|
||||
# ANSII colours for output
|
||||
def red():
|
||||
return "\033[91m"
|
||||
|
||||
|
||||
def blue():
|
||||
return "\033[94m"
|
||||
|
||||
|
||||
def cyan():
|
||||
return "\033[96m"
|
||||
|
||||
|
||||
def green():
|
||||
return "\033[92m"
|
||||
|
||||
|
||||
def yellow():
|
||||
return "\033[93m"
|
||||
|
||||
|
||||
def purple():
|
||||
return "\033[95m"
|
||||
|
||||
|
||||
def bold():
|
||||
return "\033[1m"
|
||||
|
||||
|
||||
def end():
|
||||
return "\033[0m"
|
||||
|
||||
|
||||
# Print function
|
||||
def echo(message, prefix, state):
|
||||
# Get the date
|
||||
date = "{} - ".format(datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S.%f"))
|
||||
endc = end()
|
||||
|
||||
# Continuation
|
||||
if state == "c":
|
||||
date = ""
|
||||
colour = ""
|
||||
prompt = " "
|
||||
# OK
|
||||
elif state == "o":
|
||||
colour = green()
|
||||
prompt = ">>> "
|
||||
# Error
|
||||
elif state == "e":
|
||||
colour = red()
|
||||
prompt = ">>> "
|
||||
# Warning
|
||||
elif state == "w":
|
||||
colour = yellow()
|
||||
prompt = ">>> "
|
||||
# Tick
|
||||
elif state == "t":
|
||||
colour = purple()
|
||||
prompt = ">>> "
|
||||
# Information
|
||||
elif state == "i":
|
||||
colour = blue()
|
||||
prompt = ">>> "
|
||||
else:
|
||||
colour = bold()
|
||||
prompt = ">>> "
|
||||
|
||||
# Append space to prefix
|
||||
if prefix != "":
|
||||
prefix = prefix + " "
|
||||
|
||||
print(colour + prompt + endc + date + prefix + message)
|
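A usage sketch, not part of this changeset, of the echo() printer defined above; the state codes select the coloured prompt, and "c" marks a continuation line with no date or prompt colour.

import pvc.lib.ansiprint as ansiprint

ansiprint.echo("Node hv1 is now primary coordinator", "cluster:", "o")  # green ">>> "
ansiprint.echo("Ceph cluster is in HEALTH_WARN", "storage:", "w")       # yellow ">>> "
ansiprint.echo("additional detail on its own line", "", "c")            # continuation line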
116 client-cli/pvc/lib/cluster.py Normal file
@@ -0,0 +1,116 @@
#!/usr/bin/env python3

# cluster.py - PVC CLI client function library, cluster management
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

import json

from pvc.lib.common import call_api


def initialize(config, overwrite=False):
    """
    Initialize the PVC cluster

    API endpoint: POST /api/v1/initialize
    API arguments: overwrite, yes-i-really-mean-it
    API schema: {json_data_object}
    """
    params = {"yes-i-really-mean-it": "yes", "overwrite": overwrite}
    response = call_api(config, "post", "/initialize", params=params)

    if response.status_code == 200:
        retstatus = True
    else:
        retstatus = False

    return retstatus, response.json().get("message", "")


def backup(config):
    """
    Get a JSON backup of the cluster

    API endpoint: GET /api/v1/backup
    API arguments:
    API schema: {json_data_object}
    """
    response = call_api(config, "get", "/backup")

    if response.status_code == 200:
        return True, response.json()
    else:
        return False, response.json().get("message", "")


def restore(config, cluster_data):
    """
    Restore a JSON backup to the cluster

    API endpoint: POST /api/v1/restore
    API arguments: yes-i-really-mean-it
    API schema: {json_data_object}
    """
    cluster_data_json = json.dumps(cluster_data)

    params = {"yes-i-really-mean-it": "yes"}
    data = {"cluster_data": cluster_data_json}
    response = call_api(config, "post", "/restore", params=params, data=data)

    if response.status_code == 200:
        retstatus = True
    else:
        retstatus = False

    return retstatus, response.json().get("message", "")


def maintenance_mode(config, state):
    """
    Enable or disable PVC cluster maintenance mode

    API endpoint: POST /api/v1/status
    API arguments: state={state}
    API schema: {json_data_object}
    """
    params = {"state": state}
    response = call_api(config, "post", "/status", params=params)

    if response.status_code == 200:
        retstatus = True
    else:
        retstatus = False

    return retstatus, response.json().get("message", "")


def get_info(config):
    """
    Get status of the PVC cluster

    API endpoint: GET /api/v1/status
    API arguments:
    API schema: {json_data_object}
    """
    response = call_api(config, "get", "/status")

    if response.status_code == 200:
        return True, response.json()
    else:
        return False, response.json().get("message", "")
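A usage sketch, not part of this changeset: a backup/restore round trip with the helpers above, assuming config is the same API connection dictionary consumed by call_api() in pvc/lib/common.py (an example of that dict is shown with call_api() further below) and the output path is arbitrary.

import json
import pvc.lib.cluster

retcode, cluster_data = pvc.lib.cluster.backup(config)
if retcode:
    with open("/tmp/pvc-backup.json", "w") as fh:
        json.dump(cluster_data, fh)

# restore() sets the required "yes-i-really-mean-it" confirmation itself.
retcode, retmsg = pvc.lib.cluster.restore(config, cluster_data)
print(retmsg)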
201 client-cli/pvc/lib/common.py Normal file
@@ -0,0 +1,201 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# common.py - PVC CLI client function library, Common functions
|
||||
# Part of the Parallel Virtual Cluster (PVC) system
|
||||
#
|
||||
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, version 3.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
import os
|
||||
import math
|
||||
import time
|
||||
import requests
|
||||
import click
|
||||
from urllib3 import disable_warnings
|
||||
|
||||
|
||||
def format_bytes(size_bytes):
|
||||
byte_unit_matrix = {
|
||||
"B": 1,
|
||||
"K": 1024,
|
||||
"M": 1024 * 1024,
|
||||
"G": 1024 * 1024 * 1024,
|
||||
"T": 1024 * 1024 * 1024 * 1024,
|
||||
"P": 1024 * 1024 * 1024 * 1024 * 1024,
|
||||
}
|
||||
human_bytes = "0B"
|
||||
for unit in sorted(byte_unit_matrix, key=byte_unit_matrix.get):
|
||||
formatted_bytes = int(math.ceil(size_bytes / byte_unit_matrix[unit]))
|
||||
if formatted_bytes < 10000:
|
||||
human_bytes = "{}{}".format(formatted_bytes, unit)
|
||||
break
|
||||
return human_bytes
|
||||
|
||||
|
||||
def format_metric(integer):
|
||||
integer_unit_matrix = {
|
||||
"": 1,
|
||||
"K": 1000,
|
||||
"M": 1000 * 1000,
|
||||
"B": 1000 * 1000 * 1000,
|
||||
"T": 1000 * 1000 * 1000 * 1000,
|
||||
"Q": 1000 * 1000 * 1000 * 1000 * 1000,
|
||||
}
|
||||
human_integer = "0"
|
||||
for unit in sorted(integer_unit_matrix, key=integer_unit_matrix.get):
|
||||
formatted_integer = int(math.ceil(integer / integer_unit_matrix[unit]))
|
||||
if formatted_integer < 10000:
|
||||
human_integer = "{}{}".format(formatted_integer, unit)
|
||||
break
|
||||
return human_integer
|
||||
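A small worked example, not part of this changeset: both formatters walk the unit matrix from smallest to largest and keep the first unit whose ceiling-rounded value drops below 10000, so output is always a whole number of at most four digits.

from pvc.lib.common import format_bytes, format_metric

print(format_bytes(4096))         # "4096B" - small values keep the smaller unit
print(format_bytes(123456789))    # "118M"  - ceil(123456789 / 1048576)
print(format_metric(2500000000))  # "2500M" - ceil(2500000000 / 1000000)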
|
||||
|
||||
class UploadProgressBar(object):
|
||||
def __init__(self, filename, end_message="", end_nl=True):
|
||||
file_size = os.path.getsize(filename)
|
||||
file_size_human = format_bytes(file_size)
|
||||
click.echo("Uploading file (total size {})...".format(file_size_human))
|
||||
|
||||
self.length = file_size
|
||||
self.time_last = int(round(time.time() * 1000)) - 1000
|
||||
self.bytes_last = 0
|
||||
self.bytes_diff = 0
|
||||
self.is_end = False
|
||||
|
||||
self.end_message = end_message
|
||||
self.end_nl = end_nl
|
||||
if not self.end_nl:
|
||||
self.end_suffix = " "
|
||||
else:
|
||||
self.end_suffix = ""
|
||||
|
||||
self.bar = click.progressbar(length=self.length, show_eta=True)
|
||||
|
||||
def update(self, monitor):
|
||||
bytes_cur = monitor.bytes_read
|
||||
self.bytes_diff += bytes_cur - self.bytes_last
|
||||
if self.bytes_last == bytes_cur:
|
||||
self.is_end = True
|
||||
self.bytes_last = bytes_cur
|
||||
|
||||
time_cur = int(round(time.time() * 1000))
|
||||
if (time_cur - 1000) > self.time_last:
|
||||
self.time_last = time_cur
|
||||
self.bar.update(self.bytes_diff)
|
||||
self.bytes_diff = 0
|
||||
|
||||
if self.is_end:
|
||||
self.bar.update(self.bytes_diff)
|
||||
self.bytes_diff = 0
|
||||
click.echo()
|
||||
click.echo()
|
||||
if self.end_message:
|
||||
click.echo(self.end_message + self.end_suffix, nl=self.end_nl)
|
||||
|
||||
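A usage sketch, not part of this changeset: update() expects a monitor object exposing bytes_read, which matches requests_toolbelt's MultipartEncoderMonitor; whether PVC wires uploads exactly this way, and the endpoint shown, are assumptions here.

import requests
from requests_toolbelt.multipart.encoder import MultipartEncoder, MultipartEncoderMonitor

bar = UploadProgressBar("/tmp/image.raw", end_message="Upload complete.", end_nl=True)
encoder = MultipartEncoder(
    fields={"file": ("image.raw", open("/tmp/image.raw", "rb"), "application/octet-stream")}
)
monitor = MultipartEncoderMonitor(encoder, bar.update)  # bar.update(monitor) fires as bytes are read

requests.post(
    "http://127.0.0.1:7370/api/v1/storage/ceph/volume/example/upload",  # hypothetical URI
    data=monitor,
    headers={"Content-Type": monitor.content_type},
)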
|
||||
class ErrorResponse(requests.Response):
|
||||
def __init__(self, json_data, status_code):
|
||||
self.json_data = json_data
|
||||
self.status_code = status_code
|
||||
|
||||
def json(self):
|
||||
return self.json_data
|
||||
|
||||
|
||||
def call_api(
|
||||
config,
|
||||
operation,
|
||||
request_uri,
|
||||
headers={},
|
||||
params=None,
|
||||
data=None,
|
||||
files=None,
|
||||
):
|
||||
# Set the connect timeout to 2 seconds but extremely long (48 hour) data timeout
|
||||
timeout = (2.05, 172800)
|
||||
|
||||
# Craft the URI
|
||||
uri = "{}://{}{}{}".format(
|
||||
config["api_scheme"], config["api_host"], config["api_prefix"], request_uri
|
||||
)
|
||||
|
||||
# Craft the authentication header if required
|
||||
if config["api_key"]:
|
||||
headers["X-Api-Key"] = config["api_key"]
|
||||
|
||||
# Determine the request type and hit the API
|
||||
disable_warnings()
|
||||
try:
|
||||
if operation == "get":
|
||||
response = requests.get(
|
||||
uri,
|
||||
timeout=timeout,
|
||||
headers=headers,
|
||||
params=params,
|
||||
data=data,
|
||||
verify=config["verify_ssl"],
|
||||
)
|
||||
if operation == "post":
|
||||
response = requests.post(
|
||||
uri,
|
||||
timeout=timeout,
|
||||
headers=headers,
|
||||
params=params,
|
||||
data=data,
|
||||
files=files,
|
||||
verify=config["verify_ssl"],
|
||||
)
|
||||
if operation == "put":
|
||||
response = requests.put(
|
||||
uri,
|
||||
timeout=timeout,
|
||||
headers=headers,
|
||||
params=params,
|
||||
data=data,
|
||||
files=files,
|
||||
verify=config["verify_ssl"],
|
||||
)
|
||||
if operation == "patch":
|
||||
response = requests.patch(
|
||||
uri,
|
||||
timeout=timeout,
|
||||
headers=headers,
|
||||
params=params,
|
||||
data=data,
|
||||
verify=config["verify_ssl"],
|
||||
)
|
||||
if operation == "delete":
|
||||
response = requests.delete(
|
||||
uri,
|
||||
timeout=timeout,
|
||||
headers=headers,
|
||||
params=params,
|
||||
data=data,
|
||||
verify=config["verify_ssl"],
|
||||
)
|
||||
except Exception as e:
|
||||
message = "Failed to connect to the API: {}".format(e)
|
||||
response = ErrorResponse({"message": message}, 500)
|
||||
|
||||
# Display debug output
|
||||
if config["debug"]:
|
||||
click.echo("API endpoint: {}".format(uri), err=True)
|
||||
click.echo("Response code: {}".format(response.status_code), err=True)
|
||||
click.echo("Response headers: {}".format(response.headers), err=True)
|
||||
click.echo(err=True)
|
||||
|
||||
# Return the response object
|
||||
return response
|
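A usage sketch, not part of this changeset, of call_api(); the keys in config mirror exactly what the function reads above, and the values are examples only.

from pvc.lib.common import call_api

config = {
    "api_scheme": "http",
    "api_host": "127.0.0.1:7370",
    "api_prefix": "/api/v1",
    "api_key": "",
    "verify_ssl": False,
    "debug": True,
}

# GET /api/v1/status; a connection failure yields an ErrorResponse with status 500.
response = call_api(config, "get", "/status")
if response.status_code == 200:
    print(response.json())
else:
    print(response.json().get("message", ""))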
1495 client-cli/pvc/lib/network.py Normal file
706 client-cli/pvc/lib/node.py Normal file
@@ -0,0 +1,706 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# node.py - PVC CLI client function library, node management
|
||||
# Part of the Parallel Virtual Cluster (PVC) system
|
||||
#
|
||||
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, version 3.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
import time
|
||||
|
||||
import pvc.lib.ansiprint as ansiprint
|
||||
from pvc.lib.common import call_api
|
||||
|
||||
|
||||
#
|
||||
# Primary functions
|
||||
#
|
||||
def node_coordinator_state(config, node, action):
|
||||
"""
|
||||
Set node coordinator state (primary/secondary)
|
||||
|
||||
API endpoint: POST /api/v1/node/{node}/coordinator-state
|
||||
API arguments: action={action}
|
||||
API schema: {"message": "{data}"}
|
||||
"""
|
||||
params = {"state": action}
|
||||
response = call_api(
|
||||
config,
|
||||
"post",
|
||||
"/node/{node}/coordinator-state".format(node=node),
|
||||
params=params,
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
retstatus = True
|
||||
else:
|
||||
retstatus = False
|
||||
|
||||
return retstatus, response.json().get("message", "")
|
||||
|
||||
|
||||
def node_domain_state(config, node, action):
|
||||
"""
|
||||
Set node domain state (flush/ready)
|
||||
|
||||
API endpoint: POST /api/v1/node/{node}/domain-state
|
||||
API arguments: action={action}, wait={wait}
|
||||
API schema: {"message": "{data}"}
|
||||
"""
|
||||
params = {"state": action}
|
||||
response = call_api(
|
||||
config, "post", "/node/{node}/domain-state".format(node=node), params=params
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
retstatus = True
|
||||
else:
|
||||
retstatus = False
|
||||
|
||||
return retstatus, response.json().get("message", "")
|
||||
|
||||
|
||||
def view_node_log(config, node, lines=100):
|
||||
"""
|
||||
Return node log lines from the API (and display them in a pager in the main CLI)
|
||||
|
||||
API endpoint: GET /node/{node}/log
|
||||
API arguments: lines={lines}
|
||||
API schema: {"name":"{node}","data":"{node_log}"}
|
||||
"""
|
||||
params = {"lines": lines}
|
||||
response = call_api(
|
||||
config, "get", "/node/{node}/log".format(node=node), params=params
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
return False, response.json().get("message", "")
|
||||
|
||||
node_log = response.json()["data"]
|
||||
|
||||
# Shrink the log buffer to length lines
|
||||
shrunk_log = node_log.split("\n")[-lines:]
|
||||
loglines = "\n".join(shrunk_log)
|
||||
|
||||
return True, loglines
|
||||
|
||||
|
||||
def follow_node_log(config, node, lines=10):
|
||||
"""
|
||||
Return and follow node log lines from the API
|
||||
|
||||
API endpoint: GET /node/{node}/log
|
||||
API arguments: lines={lines}
|
||||
API schema: {"name":"{nodename}","data":"{node_log}"}
|
||||
"""
|
||||
# We always grab 200 to match the follow call, but only _show_ `lines` number
|
||||
params = {"lines": 200}
|
||||
response = call_api(
|
||||
config, "get", "/node/{node}/log".format(node=node), params=params
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
return False, response.json().get("message", "")
|
||||
|
||||
# Shrink the log buffer to length lines
|
||||
node_log = response.json()["data"]
|
||||
shrunk_log = node_log.split("\n")[-int(lines) :]
|
||||
loglines = "\n".join(shrunk_log)
|
||||
|
||||
# Print the initial data and begin following
|
||||
print(loglines, end="")
|
||||
print("\n", end="")
|
||||
|
||||
while True:
|
||||
# Grab the next line set (200 is a reasonable number of lines per half-second; any more are skipped)
|
||||
try:
|
||||
params = {"lines": 200}
|
||||
response = call_api(
|
||||
config, "get", "/node/{node}/log".format(node=node), params=params
|
||||
)
|
||||
new_node_log = response.json()["data"]
|
||||
except Exception:
|
||||
break
|
||||
# Split the new and old log strings into constituent lines
|
||||
old_node_loglines = node_log.split("\n")
|
||||
new_node_loglines = new_node_log.split("\n")
|
||||
|
||||
# Set the node log to the new log value for the next iteration
|
||||
node_log = new_node_log
|
||||
|
||||
# Get the difference between the two sets of lines
|
||||
old_node_loglines_set = set(old_node_loglines)
|
||||
diff_node_loglines = [
|
||||
x for x in new_node_loglines if x not in old_node_loglines_set
|
||||
]
|
||||
|
||||
# If there's a difference, print it out
|
||||
if len(diff_node_loglines) > 0:
|
||||
print("\n".join(diff_node_loglines), end="")
|
||||
print("\n", end="")
|
||||
|
||||
# Wait half a second
|
||||
time.sleep(0.5)
|
||||
|
||||
return True, ""
|
||||
|
||||
|
||||
def node_info(config, node):
|
||||
"""
|
||||
Get information about node
|
||||
|
||||
API endpoint: GET /api/v1/node/{node}
|
||||
API arguments:
|
||||
API schema: {json_data_object}
|
||||
"""
|
||||
response = call_api(config, "get", "/node/{node}".format(node=node))
|
||||
|
||||
if response.status_code == 200:
|
||||
if isinstance(response.json(), list) and len(response.json()) != 1:
|
||||
# No exact match, return not found
|
||||
return False, "Node not found."
|
||||
else:
|
||||
# Return a single instance if the response is a list
|
||||
if isinstance(response.json(), list):
|
||||
return True, response.json()[0]
|
||||
# This shouldn't happen, but is here just in case
|
||||
else:
|
||||
return True, response.json()
|
||||
else:
|
||||
return False, response.json().get("message", "")
|
||||
|
||||
|
||||
def node_list(
|
||||
config, limit, target_daemon_state, target_coordinator_state, target_domain_state
|
||||
):
|
||||
"""
|
||||
Get list information about nodes (limited by {limit})
|
||||
|
||||
API endpoint: GET /api/v1/node
|
||||
API arguments: limit={limit}
|
||||
API schema: [{json_data_object},{json_data_object},etc.]
|
||||
"""
|
||||
params = dict()
|
||||
if limit:
|
||||
params["limit"] = limit
|
||||
if target_daemon_state:
|
||||
params["daemon_state"] = target_daemon_state
|
||||
if target_coordinator_state:
|
||||
params["coordinator_state"] = target_coordinator_state
|
||||
if target_domain_state:
|
||||
params["domain_state"] = target_domain_state
|
||||
|
||||
response = call_api(config, "get", "/node", params=params)
|
||||
|
||||
if response.status_code == 200:
|
||||
return True, response.json()
|
||||
else:
|
||||
return False, response.json().get("message", "")
|
||||
|
||||
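A usage sketch, not part of this changeset: listing only flushed nodes and rendering them with the formatter defined further below; config is the same API connection dictionary used by call_api().

import pvc.lib.node

retcode, nodes = pvc.lib.node.node_list(
    config,
    limit=None,
    target_daemon_state=None,
    target_coordinator_state=None,
    target_domain_state="flushed",
)
if retcode:
    print(pvc.lib.node.format_list(config, nodes))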
|
||||
#
|
||||
# Output display functions
|
||||
#
|
||||
def getOutputColours(node_information):
|
||||
node_health = node_information.get("health", "N/A")
|
||||
if isinstance(node_health, int):
|
||||
if node_health <= 50:
|
||||
health_colour = ansiprint.red()
|
||||
elif node_health <= 90:
|
||||
health_colour = ansiprint.yellow()
|
||||
elif node_health <= 100:
|
||||
health_colour = ansiprint.green()
|
||||
else:
|
||||
health_colour = ansiprint.blue()
|
||||
else:
|
||||
health_colour = ansiprint.blue()
|
||||
|
||||
if node_information["daemon_state"] == "run":
|
||||
daemon_state_colour = ansiprint.green()
|
||||
elif node_information["daemon_state"] == "stop":
|
||||
daemon_state_colour = ansiprint.red()
|
||||
elif node_information["daemon_state"] == "shutdown":
|
||||
daemon_state_colour = ansiprint.yellow()
|
||||
elif node_information["daemon_state"] == "init":
|
||||
daemon_state_colour = ansiprint.yellow()
|
||||
elif node_information["daemon_state"] == "dead":
|
||||
daemon_state_colour = ansiprint.red() + ansiprint.bold()
|
||||
else:
|
||||
daemon_state_colour = ansiprint.blue()
|
||||
|
||||
if node_information["coordinator_state"] == "primary":
|
||||
coordinator_state_colour = ansiprint.green()
|
||||
elif node_information["coordinator_state"] == "secondary":
|
||||
coordinator_state_colour = ansiprint.blue()
|
||||
else:
|
||||
coordinator_state_colour = ansiprint.cyan()
|
||||
|
||||
if node_information["domain_state"] == "ready":
|
||||
domain_state_colour = ansiprint.green()
|
||||
else:
|
||||
domain_state_colour = ansiprint.blue()
|
||||
|
||||
if node_information["memory"]["allocated"] > node_information["memory"]["total"]:
|
||||
mem_allocated_colour = ansiprint.yellow()
|
||||
else:
|
||||
mem_allocated_colour = ""
|
||||
|
||||
if node_information["memory"]["provisioned"] > node_information["memory"]["total"]:
|
||||
mem_provisioned_colour = ansiprint.yellow()
|
||||
else:
|
||||
mem_provisioned_colour = ""
|
||||
|
||||
return (
|
||||
health_colour,
|
||||
daemon_state_colour,
|
||||
coordinator_state_colour,
|
||||
domain_state_colour,
|
||||
mem_allocated_colour,
|
||||
mem_provisioned_colour,
|
||||
)
|
||||
|
||||
|
||||
def format_info(config, node_information, long_output):
|
||||
(
|
||||
health_colour,
|
||||
daemon_state_colour,
|
||||
coordinator_state_colour,
|
||||
domain_state_colour,
|
||||
mem_allocated_colour,
|
||||
mem_provisioned_colour,
|
||||
) = getOutputColours(node_information)
|
||||
|
||||
# Format a nice output; do this line-by-line then concat the elements at the end
|
||||
ainformation = []
|
||||
# Basic information
|
||||
ainformation.append(
|
||||
"{}Name:{} {}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
node_information["name"],
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}PVC Version:{} {}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
node_information["pvc_version"],
|
||||
)
|
||||
)
|
||||
|
||||
node_health = node_information.get("health", "N/A")
|
||||
if isinstance(node_health, int):
|
||||
node_health_text = f"{node_health}%"
|
||||
else:
|
||||
node_health_text = node_health
|
||||
ainformation.append(
|
||||
"{}Health:{} {}{}{}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
health_colour,
|
||||
node_health_text,
|
||||
ansiprint.end(),
|
||||
)
|
||||
)
|
||||
|
||||
node_health_details = node_information.get("health_details", [])
|
||||
if long_output:
|
||||
node_health_messages = "\n ".join(
|
||||
[f"{plugin['name']}: {plugin['message']}" for plugin in node_health_details]
|
||||
)
|
||||
else:
|
||||
node_health_messages = "\n ".join(
|
||||
[
|
||||
f"{plugin['name']}: {plugin['message']}"
|
||||
for plugin in node_health_details
|
||||
if int(plugin.get("health_delta", 0)) > 0
|
||||
]
|
||||
)
|
||||
|
||||
if len(node_health_messages) > 0:
|
||||
ainformation.append(
|
||||
"{}Health Plugin Details:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_health_messages
|
||||
)
|
||||
)
|
||||
ainformation.append("")
|
||||
|
||||
ainformation.append(
|
||||
"{}Daemon State:{} {}{}{}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
daemon_state_colour,
|
||||
node_information["daemon_state"],
|
||||
ansiprint.end(),
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Coordinator State:{} {}{}{}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
coordinator_state_colour,
|
||||
node_information["coordinator_state"],
|
||||
ansiprint.end(),
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Domain State:{} {}{}{}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
domain_state_colour,
|
||||
node_information["domain_state"],
|
||||
ansiprint.end(),
|
||||
)
|
||||
)
|
||||
if long_output:
|
||||
ainformation.append("")
|
||||
ainformation.append(
|
||||
"{}Architecture:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["arch"]
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Operating System:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["os"]
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Kernel Version:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["kernel"]
|
||||
)
|
||||
)
|
||||
ainformation.append("")
|
||||
ainformation.append(
|
||||
"{}Active VM Count:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["domains_count"]
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Host CPUs:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["vcpu"]["total"]
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}vCPUs:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["vcpu"]["allocated"]
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Load:{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["load"]
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Total RAM (MiB):{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["memory"]["total"]
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Used RAM (MiB):{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["memory"]["used"]
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Free RAM (MiB):{} {}".format(
|
||||
ansiprint.purple(), ansiprint.end(), node_information["memory"]["free"]
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Allocated RAM (MiB):{} {}{}{}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
mem_allocated_colour,
|
||||
node_information["memory"]["allocated"],
|
||||
ansiprint.end(),
|
||||
)
|
||||
)
|
||||
ainformation.append(
|
||||
"{}Provisioned RAM (MiB):{} {}{}{}".format(
|
||||
ansiprint.purple(),
|
||||
ansiprint.end(),
|
||||
mem_provisioned_colour,
|
||||
node_information["memory"]["provisioned"],
|
||||
ansiprint.end(),
|
||||
)
|
||||
)
|
||||
|
||||
# Join it all together
|
||||
ainformation.append("")
|
||||
return "\n".join(ainformation)
|
||||
|
||||
|
||||
def format_list(config, node_list):
|
||||
if node_list == "Node not found.":
|
||||
return node_list
|
||||
|
||||
node_list_output = []
|
||||
|
||||
# Determine optimal column widths
|
||||
node_name_length = 5
|
||||
pvc_version_length = 8
|
||||
health_length = 7
|
||||
daemon_state_length = 7
|
||||
coordinator_state_length = 12
|
||||
domain_state_length = 7
|
||||
domains_count_length = 4
|
||||
cpu_count_length = 6
|
||||
load_length = 5
|
||||
mem_total_length = 6
|
||||
mem_used_length = 5
|
||||
mem_free_length = 5
|
||||
mem_alloc_length = 6
|
||||
mem_prov_length = 5
|
||||
for node_information in node_list:
|
||||
# node_name column
|
||||
_node_name_length = len(node_information["name"]) + 1
|
||||
if _node_name_length > node_name_length:
|
||||
node_name_length = _node_name_length
|
||||
# node_pvc_version column
|
||||
_pvc_version_length = len(node_information.get("pvc_version", "N/A")) + 1
|
||||
if _pvc_version_length > pvc_version_length:
|
||||
pvc_version_length = _pvc_version_length
|
||||
# node_health column
|
||||
node_health = node_information.get("health", "N/A")
|
||||
if isinstance(node_health, int):
|
||||
node_health_text = f"{node_health}%"
|
||||
else:
|
||||
node_health_text = node_health
|
||||
_health_length = len(node_health_text) + 1
|
||||
if _health_length > health_length:
|
||||
health_length = _health_length
|
||||
# daemon_state column
|
||||
_daemon_state_length = len(node_information["daemon_state"]) + 1
|
||||
if _daemon_state_length > daemon_state_length:
|
||||
daemon_state_length = _daemon_state_length
|
||||
# coordinator_state column
|
||||
_coordinator_state_length = len(node_information["coordinator_state"]) + 1
|
||||
if _coordinator_state_length > coordinator_state_length:
|
||||
coordinator_state_length = _coordinator_state_length
|
||||
# domain_state column
|
||||
_domain_state_length = len(node_information["domain_state"]) + 1
|
||||
if _domain_state_length > domain_state_length:
|
||||
domain_state_length = _domain_state_length
|
||||
# domains_count column
|
||||
_domains_count_length = len(str(node_information["domains_count"])) + 1
|
||||
if _domains_count_length > domains_count_length:
|
||||
domains_count_length = _domains_count_length
|
||||
# cpu_count column
|
||||
_cpu_count_length = len(str(node_information["cpu_count"])) + 1
|
||||
if _cpu_count_length > cpu_count_length:
|
||||
cpu_count_length = _cpu_count_length
|
||||
# load column
|
||||
_load_length = len(str(node_information["load"])) + 1
|
||||
if _load_length > load_length:
|
||||
load_length = _load_length
|
||||
# mem_total column
|
||||
_mem_total_length = len(str(node_information["memory"]["total"])) + 1
|
||||
if _mem_total_length > mem_total_length:
|
||||
mem_total_length = _mem_total_length
|
||||
# mem_used column
|
||||
_mem_used_length = len(str(node_information["memory"]["used"])) + 1
|
||||
if _mem_used_length > mem_used_length:
|
||||
mem_used_length = _mem_used_length
|
||||
# mem_free column
|
||||
_mem_free_length = len(str(node_information["memory"]["free"])) + 1
|
||||
if _mem_free_length > mem_free_length:
|
||||
mem_free_length = _mem_free_length
|
||||
# mem_alloc column
|
||||
_mem_alloc_length = len(str(node_information["memory"]["allocated"])) + 1
|
||||
if _mem_alloc_length > mem_alloc_length:
|
||||
mem_alloc_length = _mem_alloc_length
|
||||
|
||||
# mem_prov column
|
||||
_mem_prov_length = len(str(node_information["memory"]["provisioned"])) + 1
|
||||
if _mem_prov_length > mem_prov_length:
|
||||
mem_prov_length = _mem_prov_length
|
||||
|
||||
# Format the string (header)
|
||||
node_list_output.append(
|
||||
"{bold}{node_header: <{node_header_length}} {state_header: <{state_header_length}} {resource_header: <{resource_header_length}} {memory_header: <{memory_header_length}}{end_bold}".format(
|
||||
node_header_length=node_name_length
|
||||
+ pvc_version_length
|
||||
+ health_length
|
||||
+ 2,
|
||||
state_header_length=daemon_state_length
|
||||
+ coordinator_state_length
|
||||
+ domain_state_length
|
||||
+ 2,
|
||||
resource_header_length=domains_count_length
|
||||
+ cpu_count_length
|
||||
+ load_length
|
||||
+ 2,
|
||||
memory_header_length=mem_total_length
|
||||
+ mem_used_length
|
||||
+ mem_free_length
|
||||
+ mem_alloc_length
|
||||
+ mem_prov_length
|
||||
+ 4,
|
||||
bold=ansiprint.bold(),
|
||||
end_bold=ansiprint.end(),
|
||||
node_header="Nodes "
|
||||
+ "".join(
|
||||
[
|
||||
"-"
|
||||
for _ in range(
|
||||
6, node_name_length + pvc_version_length + health_length + 1
|
||||
)
|
||||
]
|
||||
),
|
||||
state_header="States "
|
||||
+ "".join(
|
||||
[
|
||||
"-"
|
||||
for _ in range(
|
||||
7,
|
||||
daemon_state_length
|
||||
+ coordinator_state_length
|
||||
+ domain_state_length
|
||||
+ 1,
|
||||
)
|
||||
]
|
||||
),
|
||||
resource_header="Resources "
|
||||
+ "".join(
|
||||
[
|
||||
"-"
|
||||
for _ in range(
|
||||
10, domains_count_length + cpu_count_length + load_length + 1
|
||||
)
|
||||
]
|
||||
),
|
||||
memory_header="Memory (M) "
|
||||
+ "".join(
|
||||
[
|
||||
"-"
|
||||
for _ in range(
|
||||
11,
|
||||
mem_total_length
|
||||
+ mem_used_length
|
||||
+ mem_free_length
|
||||
+ mem_alloc_length
|
||||
+ mem_prov_length
|
||||
+ 3,
|
||||
)
|
||||
]
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
node_list_output.append(
|
||||
"{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} {node_health: <{health_length}} \
|
||||
{daemon_state_colour}{node_daemon_state: <{daemon_state_length}}{end_colour} {coordinator_state_colour}{node_coordinator_state: <{coordinator_state_length}}{end_colour} {domain_state_colour}{node_domain_state: <{domain_state_length}}{end_colour} \
|
||||
{node_domains_count: <{domains_count_length}} {node_cpu_count: <{cpu_count_length}} {node_load: <{load_length}} \
|
||||
{node_mem_total: <{mem_total_length}} {node_mem_used: <{mem_used_length}} {node_mem_free: <{mem_free_length}} {node_mem_allocated: <{mem_alloc_length}} {node_mem_provisioned: <{mem_prov_length}}{end_bold}".format(
|
||||
node_name_length=node_name_length,
|
||||
pvc_version_length=pvc_version_length,
|
||||
health_length=health_length,
|
||||
daemon_state_length=daemon_state_length,
|
||||
coordinator_state_length=coordinator_state_length,
|
||||
domain_state_length=domain_state_length,
|
||||
domains_count_length=domains_count_length,
|
||||
cpu_count_length=cpu_count_length,
|
||||
load_length=load_length,
|
||||
mem_total_length=mem_total_length,
|
||||
mem_used_length=mem_used_length,
|
||||
mem_free_length=mem_free_length,
|
||||
mem_alloc_length=mem_alloc_length,
|
||||
mem_prov_length=mem_prov_length,
|
||||
bold=ansiprint.bold(),
|
||||
end_bold=ansiprint.end(),
|
||||
daemon_state_colour="",
|
||||
coordinator_state_colour="",
|
||||
domain_state_colour="",
|
||||
end_colour="",
|
||||
node_name="Name",
|
||||
node_pvc_version="Version",
|
||||
node_health="Health",
|
||||
node_daemon_state="Daemon",
|
||||
node_coordinator_state="Coordinator",
|
||||
node_domain_state="Domain",
|
||||
node_domains_count="VMs",
|
||||
node_cpu_count="vCPUs",
|
||||
node_load="Load",
|
||||
node_mem_total="Total",
|
||||
node_mem_used="Used",
|
||||
node_mem_free="Free",
|
||||
node_mem_allocated="Alloc",
|
||||
node_mem_provisioned="Prov",
|
||||
)
|
||||
)
|
||||
|
||||
# Format the string (elements)
|
||||
for node_information in sorted(node_list, key=lambda n: n["name"]):
|
||||
(
|
||||
health_colour,
|
||||
daemon_state_colour,
|
||||
coordinator_state_colour,
|
||||
domain_state_colour,
|
||||
mem_allocated_colour,
|
||||
mem_provisioned_colour,
|
||||
) = getOutputColours(node_information)
|
||||
|
||||
node_health = node_information.get("health", "N/A")
|
||||
if isinstance(node_health, int):
|
||||
node_health_text = f"{node_health}%"
|
||||
else:
|
||||
node_health_text = node_health
|
||||
|
||||
node_list_output.append(
|
||||
"{bold}{node_name: <{node_name_length}} {node_pvc_version: <{pvc_version_length}} {health_colour}{node_health: <{health_length}}{end_colour} \
|
||||
{daemon_state_colour}{node_daemon_state: <{daemon_state_length}}{end_colour} {coordinator_state_colour}{node_coordinator_state: <{coordinator_state_length}}{end_colour} {domain_state_colour}{node_domain_state: <{domain_state_length}}{end_colour} \
|
||||
{node_domains_count: <{domains_count_length}} {node_cpu_count: <{cpu_count_length}} {node_load: <{load_length}} \
|
||||
{node_mem_total: <{mem_total_length}} {node_mem_used: <{mem_used_length}} {node_mem_free: <{mem_free_length}} {mem_allocated_colour}{node_mem_allocated: <{mem_alloc_length}}{end_colour} {mem_provisioned_colour}{node_mem_provisioned: <{mem_prov_length}}{end_colour}{end_bold}".format(
|
||||
node_name_length=node_name_length,
|
||||
pvc_version_length=pvc_version_length,
|
||||
health_length=health_length,
|
||||
daemon_state_length=daemon_state_length,
|
||||
coordinator_state_length=coordinator_state_length,
|
||||
domain_state_length=domain_state_length,
|
||||
domains_count_length=domains_count_length,
|
||||
cpu_count_length=cpu_count_length,
|
||||
load_length=load_length,
|
||||
mem_total_length=mem_total_length,
|
||||
mem_used_length=mem_used_length,
|
||||
mem_free_length=mem_free_length,
|
||||
mem_alloc_length=mem_alloc_length,
|
||||
mem_prov_length=mem_prov_length,
|
||||
bold="",
|
||||
end_bold="",
|
||||
health_colour=health_colour,
|
||||
daemon_state_colour=daemon_state_colour,
|
||||
coordinator_state_colour=coordinator_state_colour,
|
||||
domain_state_colour=domain_state_colour,
|
||||
mem_allocated_colour=mem_allocated_colour,
|
||||
mem_provisioned_colour=mem_provisioned_colour,
|
||||
end_colour=ansiprint.end(),
|
||||
node_name=node_information["name"],
|
||||
node_pvc_version=node_information.get("pvc_version", "N/A"),
|
||||
node_health=node_health_text,
|
||||
node_daemon_state=node_information["daemon_state"],
|
||||
node_coordinator_state=node_information["coordinator_state"],
|
||||
node_domain_state=node_information["domain_state"],
|
||||
node_domains_count=node_information["domains_count"],
|
||||
node_cpu_count=node_information["vcpu"]["allocated"],
|
||||
node_load=node_information["load"],
|
||||
node_mem_total=node_information["memory"]["total"],
|
||||
node_mem_used=node_information["memory"]["used"],
|
||||
node_mem_free=node_information["memory"]["free"],
|
||||
node_mem_allocated=node_information["memory"]["allocated"],
|
||||
node_mem_provisioned=node_information["memory"]["provisioned"],
|
||||
)
|
||||
)
|
||||
|
||||
return "\n".join(node_list_output)
|
2019 client-cli/pvc/lib/provisioner.py Normal file
2601 client-cli/pvc/lib/storage.py Normal file
2040 client-cli/pvc/lib/vm.py Normal file
102 client-cli/pvc/lib/zkhandler.py Normal file
@@ -0,0 +1,102 @@
#!/usr/bin/env python3

# zkhandler.py - Secure versioned ZooKeeper updates
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

import uuid


# Exists function
def exists(zk_conn, key):
    stat = zk_conn.exists(key)
    if stat:
        return True
    else:
        return False


# Child list function
def listchildren(zk_conn, key):
    children = zk_conn.get_children(key)
    return children


# Delete key function
def deletekey(zk_conn, key, recursive=True):
    zk_conn.delete(key, recursive=recursive)


# Data read function
def readdata(zk_conn, key):
    data_raw = zk_conn.get(key)
    data = data_raw[0].decode("utf8")
    return data


# Data write function
def writedata(zk_conn, kv):
    # Start up a transaction
    zk_transaction = zk_conn.transaction()

    # Proceed one KV pair at a time
    for key in sorted(kv):
        data = kv[key]

        # Check if this key already exists or not
        if not zk_conn.exists(key):
            # We're creating a new key
            zk_transaction.create(key, str(data).encode("utf8"))
        else:
            # We're updating a key with version validation
            orig_data = zk_conn.get(key)
            version = orig_data[1].version

            # Set what we expect the new version to be
            new_version = version + 1

            # Update the data
            zk_transaction.set_data(key, str(data).encode("utf8"))

            # Set up the check
            try:
                zk_transaction.check(key, new_version)
            except TypeError:
                print('Zookeeper key "{}" does not match expected version'.format(key))
                return False

    # Commit the transaction
    try:
        zk_transaction.commit()
        return True
    except Exception:
        return False


# Write lock function
def writelock(zk_conn, key):
    lock_id = str(uuid.uuid1())
    lock = zk_conn.WriteLock("{}".format(key), lock_id)
    return lock


# Read lock function
def readlock(zk_conn, key):
    lock_id = str(uuid.uuid1())
    lock = zk_conn.ReadLock("{}".format(key), lock_id)
    return lock
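A usage sketch, not part of this changeset: writedata() batches every key into a single ZooKeeper transaction and guards updates with a version check, so a concurrent writer cannot be silently overwritten. The kazoo connection and the ZooKeeper paths shown here are illustrative only.

from kazoo.client import KazooClient

zk_conn = KazooClient(hosts="127.0.0.1:2181")
zk_conn.start()

# Serialise writers on the key, then update two keys atomically.
with writelock(zk_conn, "/nodes/hv1/domainstate"):
    result = writedata(
        zk_conn,
        {
            "/nodes/hv1/domainstate": "flush",
            "/nodes/hv1/memfree": 2048,
        },
    )
print(result)  # True on commit; False if a version check or the commit failed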
@@ -2,8 +2,8 @@ from setuptools import setup
 
 setup(
     name="pvc",
-    version="0.9.58",
-    packages=["pvc", "pvc.cli_lib"],
+    version="0.9.76",
+    packages=["pvc.cli", "pvc.lib"],
     install_requires=[
         "Click",
         "PyYAML",
@@ -14,7 +14,7 @@ setup(
     ],
     entry_points={
         "console_scripts": [
-            "pvc = pvc.pvc:cli",
+            "pvc = pvc.cli.cli:cli",
         ],
     },
 )
@ -35,6 +35,7 @@ import daemon_lib.common as common
|
||||
# Supplemental functions
|
||||
#
|
||||
|
||||
|
||||
# Verify OSD is valid in cluster
|
||||
def verifyOSD(zkhandler, osd_id):
|
||||
return zkhandler.exists(("osd", osd_id))
|
||||
@ -73,6 +74,11 @@ byte_unit_matrix = {
|
||||
"G": 1024 * 1024 * 1024,
|
||||
"T": 1024 * 1024 * 1024 * 1024,
|
||||
"P": 1024 * 1024 * 1024 * 1024 * 1024,
|
||||
"E": 1024 * 1024 * 1024 * 1024 * 1024 * 1024,
|
||||
"Z": 1024 * 1024 * 1024 * 1024 * 1024 * 1024 * 1024,
|
||||
"Y": 1024 * 1024 * 1024 * 1024 * 1024 * 1024 * 1024 * 1024,
|
||||
"R": 1024 * 1024 * 1024 * 1024 * 1024 * 1024 * 1024 * 1024 * 1024,
|
||||
"Q": 1024 * 1024 * 1024 * 1024 * 1024 * 1024 * 1024 * 1024 * 1024 * 1024,
|
||||
}
|
||||
|
||||
# Matrix of human-to-metric values
|
||||
@ -83,6 +89,11 @@ ops_unit_matrix = {
|
||||
"G": 1000 * 1000 * 1000,
|
||||
"T": 1000 * 1000 * 1000 * 1000,
|
||||
"P": 1000 * 1000 * 1000 * 1000 * 1000,
|
||||
"E": 1000 * 1000 * 1000 * 1000 * 1000 * 1000,
|
||||
"Z": 1000 * 1000 * 1000 * 1000 * 1000 * 1000 * 1000,
|
||||
"Y": 1000 * 1000 * 1000 * 1000 * 1000 * 1000 * 1000 * 1000,
|
||||
"R": 1000 * 1000 * 1000 * 1000 * 1000 * 1000 * 1000 * 1000 * 1000,
|
||||
"Q": 1000 * 1000 * 1000 * 1000 * 1000 * 1000 * 1000 * 1000 * 1000 * 1000,
|
||||
}
|
||||
|
||||
|
||||
@ -103,14 +114,18 @@ def format_bytes_tohuman(databytes):
|
||||
|
||||
|
||||
def format_bytes_fromhuman(datahuman):
|
||||
# Trim off human-readable character
|
||||
dataunit = str(datahuman)[-1]
|
||||
datasize = int(str(datahuman)[:-1])
|
||||
if not re.match(r"[A-Z]", dataunit):
|
||||
if not re.search(r"[A-Za-z]+", datahuman):
|
||||
dataunit = "B"
|
||||
datasize = int(datahuman)
|
||||
databytes = datasize * byte_unit_matrix[dataunit]
|
||||
return databytes
|
||||
else:
|
||||
dataunit = str(re.match(r"[0-9]+([A-Za-z])[iBb]*", datahuman).group(1))
|
||||
datasize = int(re.match(r"([0-9]+)[A-Za-z]+", datahuman).group(1))
|
||||
|
||||
if byte_unit_matrix.get(dataunit):
|
||||
databytes = datasize * byte_unit_matrix[dataunit]
|
||||
return databytes
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
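A small worked example, not part of this changeset, of the revised parser (assuming the surrounding hunk is daemon_lib's Ceph library, which defines byte_unit_matrix above): a bare integer is treated as bytes, a value with an SI suffix is split by the regexes above and multiplied out, and an unknown suffix now returns None instead of raising.

from daemon_lib.ceph import format_bytes_fromhuman

format_bytes_fromhuman("4096")    # 4096 - no unit, already in bytes
format_bytes_fromhuman("8G")      # 8589934592 - 8 * 1024**3
format_bytes_fromhuman("256MiB")  # 268435456 - the trailing "iB" is consumed by the regex
format_bytes_fromhuman("10X")     # None - "X" is not in byte_unit_matrix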
# Format ops sizes to/from human-readable units
|
||||
@ -158,6 +173,19 @@ def get_status(zkhandler):
|
||||
return True, status_data
|
||||
|
||||
|
||||
def get_health(zkhandler):
|
||||
primary_node = zkhandler.read("base.config.primary_node")
|
||||
ceph_health = zkhandler.read("base.storage.health").rstrip()
|
||||
|
||||
# Create a data structure for the information
|
||||
status_data = {
|
||||
"type": "health",
|
||||
"primary_node": primary_node,
|
||||
"ceph_data": ceph_health,
|
||||
}
|
||||
return True, status_data
|
||||
|
||||
|
||||
def get_util(zkhandler):
|
||||
primary_node = zkhandler.read("base.config.primary_node")
|
||||
ceph_df = zkhandler.read("base.storage.util").rstrip()
|
||||
@ -718,22 +746,26 @@ def getVolumeInformation(zkhandler, pool, volume):
|
||||
|
||||
|
||||
def add_volume(zkhandler, pool, name, size):
|
||||
# Add 'B' if the volume is in bytes
|
||||
if re.match(r"^[0-9]+$", size):
|
||||
size = "{}B".format(size)
|
||||
|
||||
# 1. Verify the size of the volume
|
||||
pool_information = getPoolInformation(zkhandler, pool)
|
||||
size_bytes = format_bytes_fromhuman(size)
|
||||
if size_bytes is None:
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Requested volume size '{size}' does not have a valid SI unit",
|
||||
)
|
||||
|
||||
if size_bytes >= int(pool_information["stats"]["free_bytes"]):
|
||||
return (
|
||||
False,
|
||||
"ERROR: Requested volume size is greater than the available free space in the pool",
|
||||
f"ERROR: Requested volume size '{format_bytes_tohuman(size_bytes)}' is greater than the available free space in the pool ('{format_bytes_tohuman(pool_information['stats']['free_bytes'])}')",
|
||||
)
|
||||
|
||||
# 2. Create the volume
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
"rbd create --size {} {}/{}".format(size, pool, name)
|
||||
"rbd create --size {} {}/{}".format(
|
||||
format_bytes_tohuman(size_bytes), pool, name
|
||||
)
|
||||
)
|
||||
if retcode:
|
||||
return False, 'ERROR: Failed to create RBD volume "{}": {}'.format(name, stderr)
|
||||
@ -753,7 +785,9 @@ def add_volume(zkhandler, pool, name, size):
|
||||
]
|
||||
)
|
||||
|
||||
return True, 'Created RBD volume "{}/{}" ({}).'.format(pool, name, size)
|
||||
return True, 'Created RBD volume "{}" of size "{}" in pool "{}".'.format(
|
||||
name, format_bytes_tohuman(size_bytes), pool
|
||||
)
|
||||
|
||||
|
||||
def clone_volume(zkhandler, pool, name_src, name_new):
|
||||
@ -800,28 +834,32 @@ def resize_volume(zkhandler, pool, name, size):
|
||||
name, pool
|
||||
)
|
||||
|
||||
# Add 'B' if the volume is in bytes
|
||||
if re.match(r"^[0-9]+$", size):
|
||||
size = "{}B".format(size)
|
||||
|
||||
# 1. Verify the size of the volume
|
||||
pool_information = getPoolInformation(zkhandler, pool)
|
||||
size_bytes = format_bytes_fromhuman(size)
|
||||
if size_bytes is None:
|
||||
return (
|
||||
False,
|
||||
f"ERROR: Requested volume size '{size}' does not have a valid SI unit",
|
||||
)
|
||||
|
||||
if size_bytes >= int(pool_information["stats"]["free_bytes"]):
|
||||
return (
|
||||
False,
|
||||
"ERROR: Requested volume size is greater than the available free space in the pool",
|
||||
f"ERROR: Requested volume size '{format_bytes_tohuman(size_bytes)}' is greater than the available free space in the pool ('{format_bytes_tohuman(pool_information['stats']['free_bytes'])}')",
|
||||
)
|
||||
|
||||
# 2. Resize the volume
|
||||
retcode, stdout, stderr = common.run_os_command(
|
||||
"rbd resize --size {} {}/{}".format(size, pool, name)
|
||||
"rbd resize --size {} {}/{}".format(
|
||||
format_bytes_tohuman(size_bytes), pool, name
|
||||
)
|
||||
)
|
||||
if retcode:
|
||||
return (
|
||||
False,
|
||||
'ERROR: Failed to resize RBD volume "{}" to size "{}" in pool "{}": {}'.format(
|
||||
name, size, pool, stderr
|
||||
name, format_bytes_tohuman(size_bytes), pool, stderr
|
||||
),
|
||||
)
|
||||
|
||||
@ -847,7 +885,7 @@ def resize_volume(zkhandler, pool, name, size):
|
||||
if target_vm_conn:
|
||||
target_vm_conn.blockResize(
|
||||
volume_id,
|
||||
format_bytes_fromhuman(size),
|
||||
size_bytes,
|
||||
libvirt.VIR_DOMAIN_BLOCK_RESIZE_BYTES,
|
||||
)
|
||||
target_lv_conn.close()
|
||||
@ -870,7 +908,7 @@ def resize_volume(zkhandler, pool, name, size):
|
||||
)
|
||||
|
||||
return True, 'Resized RBD volume "{}" to size "{}" in pool "{}".'.format(
|
||||
name, size, pool
|
||||
name, format_bytes_tohuman(size_bytes), pool
|
||||
)
|
||||
|
||||
|
||||
|
@ -19,7 +19,7 @@
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
import re
|
||||
from json import loads
|
||||
|
||||
import daemon_lib.common as common
|
||||
import daemon_lib.vm as pvc_vm
|
||||
@ -44,17 +44,179 @@ def set_maintenance(zkhandler, maint_state):
|
||||
return True, "Successfully set cluster in normal mode"
|
||||
|
||||
|
||||
def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
|
||||
health_delta_map = {
|
||||
"node_stopped": 50,
|
||||
"node_flushed": 10,
|
||||
"vm_stopped": 10,
|
||||
"osd_out": 50,
|
||||
"osd_down": 10,
|
||||
"osd_full": 50,
|
||||
"osd_nearfull": 10,
|
||||
"memory_overprovisioned": 50,
|
||||
"ceph_err": 50,
|
||||
"ceph_warn": 10,
|
||||
}
|
||||
|
||||
# Generate total cluster health numbers
|
||||
cluster_health_value = 100
|
||||
cluster_health_messages = list()
|
||||
|
||||
for index, node in enumerate(node_list):
|
||||
# Apply node health values to total health number
|
||||
try:
|
||||
node_health_int = int(node["health"])
|
||||
except Exception:
|
||||
node_health_int = 100
|
||||
cluster_health_value -= 100 - node_health_int
|
||||
|
||||
for entry in node["health_details"]:
|
||||
if entry["health_delta"] > 0:
|
||||
cluster_health_messages.append(
|
||||
f"{node['name']}: plugin '{entry['name']}': {entry['message']}"
|
||||
)
|
||||
|
||||
# Handle unhealthy node states
|
||||
if node["daemon_state"] not in ["run"]:
|
||||
cluster_health_value -= health_delta_map["node_stopped"]
|
||||
cluster_health_messages.append(
|
||||
f"cluster: Node {node['name']} in {node['daemon_state'].upper()} daemon state"
|
||||
)
|
||||
elif node["domain_state"] not in ["ready"]:
|
||||
cluster_health_value -= health_delta_map["node_flushed"]
|
||||
cluster_health_messages.append(
|
||||
f"cluster: Node {node['name']} in {node['domain_state'].upper()} domain state"
|
||||
)
|
||||
|
||||
for index, vm in enumerate(vm_list):
|
||||
# Handle unhealthy VM states
|
||||
if vm["state"] in ["stop", "fail"]:
|
||||
cluster_health_value -= health_delta_map["vm_stopped"]
|
||||
cluster_health_messages.append(
|
||||
f"cluster: VM {vm['name']} in {vm['state'].upper()} state"
|
||||
)
|
||||
|
||||
for index, ceph_osd in enumerate(ceph_osd_list):
|
||||
in_texts = {1: "in", 0: "out"}
|
||||
up_texts = {1: "up", 0: "down"}
|
||||
|
||||
# Handle unhealthy OSD states
|
||||
if in_texts[ceph_osd["stats"]["in"]] not in ["in"]:
|
||||
cluster_health_value -= health_delta_map["osd_out"]
|
||||
cluster_health_messages.append(
|
||||
f"cluster: Ceph OSD {ceph_osd['id']} in {in_texts[ceph_osd['stats']['in']].upper()} state"
|
||||
)
|
||||
elif up_texts[ceph_osd["stats"]["up"]] not in ["up"]:
|
||||
cluster_health_value -= health_delta_map["osd_down"]
|
||||
cluster_health_messages.append(
|
||||
f"cluster: Ceph OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']].upper()} state"
|
||||
)
|
||||
|
||||
# Handle full (>=90%) or nearfull (>=85%) OSDs
|
||||
if ceph_osd["stats"]["utilization"] >= 90:
|
||||
cluster_health_value -= health_delta_map["osd_full"]
|
||||
cluster_health_messages.append(
|
||||
f"cluster: Ceph OSD {ceph_osd['id']} is FULL ({ceph_osd['stats']['utilization']:.1f}% > 90%)"
|
||||
)
|
||||
elif ceph_osd["stats"]["utilization"] >= 85:
|
||||
cluster_health_value -= health_delta_map["osd_nearfull"]
|
||||
cluster_health_messages.append(
|
||||
f"cluster: Ceph OSD {ceph_osd['id']} is NEARFULL ({ceph_osd['stats']['utilization']:.1f}% > 85%)"
|
||||
)
|
||||
|
||||
# Check for (n-1) overprovisioning
|
||||
# Assume X nodes. If the total VM memory allocation (counting only running VMs) is greater than
|
||||
# the total memory of the (n-1) smallest nodes, trigger this warning.
|
||||
n_minus_1_total = 0
|
||||
alloc_total = 0
|
||||
node_largest_index = None
|
||||
node_largest_count = 0
|
||||
for index, node in enumerate(node_list):
|
||||
node_mem_total = node["memory"]["total"]
|
||||
node_mem_alloc = node["memory"]["allocated"]
|
||||
alloc_total += node_mem_alloc
|
||||
# Determine if this node is the largest seen so far
|
||||
if node_mem_total > node_largest_count:
|
||||
node_largest_index = index
|
||||
node_largest_count = node_mem_total
|
||||
n_minus_1_node_list = list()
|
||||
for index, node in enumerate(node_list):
|
||||
if index == node_largest_index:
|
||||
continue
|
||||
n_minus_1_node_list.append(node)
|
||||
for index, node in enumerate(n_minus_1_node_list):
|
||||
n_minus_1_total += node["memory"]["total"]
|
||||
if alloc_total > n_minus_1_total:
|
||||
cluster_health_value -= health_delta_map["memory_overprovisioned"]
|
||||
cluster_health_messages.append(
|
||||
f"cluster: Total memory is OVERPROVISIONED ({alloc_total} > {n_minus_1_total} @ N-1)"
|
||||
)
|
||||
|
||||
# Check Ceph cluster health
|
||||
ceph_health = loads(zkhandler.read("base.storage.health"))
|
||||
ceph_health_status = ceph_health["status"]
|
||||
ceph_health_entries = ceph_health["checks"].keys()
|
||||
|
||||
ceph_health_status_map = {
|
||||
"HEALTH_ERR": "ERROR",
|
||||
"HEALTH_WARN": "WARNING",
|
||||
}
|
||||
for entry in ceph_health_entries:
|
||||
cluster_health_messages.append(
|
||||
f"cluster: Ceph {ceph_health_status_map[ceph_health['checks'][entry]['severity']]} {entry}: {ceph_health['checks'][entry]['summary']['message']}"
|
||||
)
|
||||
|
||||
if ceph_health_status == "HEALTH_ERR":
|
||||
cluster_health_value -= health_delta_map["ceph_err"]
|
||||
elif ceph_health_status == "HEALTH_WARN":
|
||||
cluster_health_value -= health_delta_map["ceph_warn"]
|
||||
|
||||
if cluster_health_value < 0:
|
||||
cluster_health_value = 0
|
||||
|
||||
cluster_health = {
|
||||
"health": cluster_health_value,
|
||||
"messages": cluster_health_messages,
|
||||
}
|
||||
|
||||
return cluster_health
|
||||
|
||||
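A worked example, not part of this changeset, of the scoring above: the cluster starts at 100, inherits every node's health deficit, then loses the mapped delta for each abnormal condition, and is clamped at zero.

cluster_health_value = 100
cluster_health_value -= 100 - 75   # one node reporting health 75
cluster_health_value -= 10         # one VM in "stop" state (vm_stopped)
cluster_health_value -= 10         # Ceph reports HEALTH_WARN (ceph_warn)
assert cluster_health_value == 55  # reported alongside the matching messages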
|
||||
def getNodeHealth(zkhandler, node_list):
|
||||
node_health = dict()
|
||||
for index, node in enumerate(node_list):
|
||||
node_health_messages = list()
|
||||
node_health_value = node["health"]
|
||||
for entry in node["health_details"]:
|
||||
if entry["health_delta"] > 0:
|
||||
node_health_messages.append(f"'{entry['name']}': {entry['message']}")
|
||||
|
||||
node_health_entry = {
|
||||
"health": node_health_value,
|
||||
"messages": node_health_messages,
|
||||
}
|
||||
|
||||
node_health[node["name"]] = node_health_entry
|
||||
|
||||
return node_health
|
||||
|
||||
|
||||
def getClusterInformation(zkhandler):
|
||||
# Get cluster maintenance state
|
||||
maint_state = zkhandler.read("base.config.maintenance")
|
||||
|
||||
# List of messages to display to the clients
|
||||
cluster_health_msg = []
|
||||
storage_health_msg = []
|
||||
maintenance_state = zkhandler.read("base.config.maintenance")
|
||||
|
||||
# Get node information object list
|
||||
retcode, node_list = pvc_node.get_list(zkhandler, None)
|
||||
|
||||
# Get primary node
|
||||
primary_node = common.getPrimaryNode(zkhandler)
|
||||
|
||||
# Get PVC version of primary node
|
||||
pvc_version = "0.0.0"
|
||||
for node in node_list:
|
||||
if node["name"] == primary_node:
|
||||
pvc_version = node["pvc_version"]
|
||||
|
||||
# Get vm information object list
|
||||
retcode, vm_list = pvc_vm.get_list(zkhandler, None, None, None, None)
|
||||
|
||||
@ -78,135 +240,6 @@ def getClusterInformation(zkhandler):
|
||||
ceph_volume_count = len(ceph_volume_list)
|
||||
ceph_snapshot_count = len(ceph_snapshot_list)
|
||||
|
||||
# Determinations for general cluster health
|
||||
cluster_healthy_status = True
|
||||
# Check for (n-1) overprovisioning
|
||||
# Assume X nodes. If the total VM memory allocation (counting only running VMs) is greater than
|
||||
# the total memory of the (n-1) smallest nodes, trigger this warning.
|
||||
n_minus_1_total = 0
|
||||
alloc_total = 0
|
||||
|
||||
node_largest_index = None
|
||||
node_largest_count = 0
|
||||
for index, node in enumerate(node_list):
|
||||
node_mem_total = node["memory"]["total"]
|
||||
node_mem_alloc = node["memory"]["allocated"]
|
||||
alloc_total += node_mem_alloc
|
||||
|
||||
# Determine if this node is the largest seen so far
|
||||
if node_mem_total > node_largest_count:
|
||||
node_largest_index = index
|
||||
node_largest_count = node_mem_total
|
||||
n_minus_1_node_list = list()
|
||||
for index, node in enumerate(node_list):
|
||||
if index == node_largest_index:
|
||||
continue
|
||||
n_minus_1_node_list.append(node)
|
||||
for index, node in enumerate(n_minus_1_node_list):
|
||||
n_minus_1_total += node["memory"]["total"]
|
||||
if alloc_total > n_minus_1_total:
|
||||
cluster_healthy_status = False
|
||||
cluster_health_msg.append(
|
||||
"Total VM memory ({}) is overprovisioned (max {}) for (n-1) failure scenarios".format(
|
||||
alloc_total, n_minus_1_total
|
||||
)
|
||||
)
|
||||
|
||||
# Determinations for node health
|
||||
node_healthy_status = list(range(0, node_count))
|
||||
node_report_status = list(range(0, node_count))
|
||||
for index, node in enumerate(node_list):
|
||||
daemon_state = node["daemon_state"]
|
||||
domain_state = node["domain_state"]
|
||||
if daemon_state != "run" and domain_state != "ready":
|
||||
node_healthy_status[index] = False
|
||||
cluster_health_msg.append(
|
||||
"Node '{}' in {},{} state".format(
|
||||
node["name"], daemon_state, domain_state
|
||||
)
|
||||
)
|
||||
else:
|
||||
node_healthy_status[index] = True
|
||||
node_report_status[index] = daemon_state + "," + domain_state
|
||||
|
||||
# Determinations for VM health
|
||||
vm_healthy_status = list(range(0, vm_count))
|
||||
vm_report_status = list(range(0, vm_count))
|
||||
for index, vm in enumerate(vm_list):
|
||||
vm_state = vm["state"]
|
||||
if vm_state not in ["start", "disable", "migrate", "unmigrate", "provision"]:
|
||||
vm_healthy_status[index] = False
|
||||
cluster_health_msg.append(
|
||||
"VM '{}' in {} state".format(vm["name"], vm_state)
|
||||
)
|
||||
else:
|
||||
vm_healthy_status[index] = True
|
||||
vm_report_status[index] = vm_state
|
||||
|
||||
# Determinations for OSD health
|
||||
ceph_osd_healthy_status = list(range(0, ceph_osd_count))
|
||||
ceph_osd_report_status = list(range(0, ceph_osd_count))
|
||||
for index, ceph_osd in enumerate(ceph_osd_list):
|
||||
try:
|
||||
ceph_osd_up = ceph_osd["stats"]["up"]
|
||||
except KeyError:
|
||||
ceph_osd_up = 0
|
||||
|
||||
try:
|
||||
ceph_osd_in = ceph_osd["stats"]["in"]
|
||||
except KeyError:
|
||||
ceph_osd_in = 0
|
||||
|
||||
up_texts = {1: "up", 0: "down"}
|
||||
in_texts = {1: "in", 0: "out"}
|
||||
|
||||
if not ceph_osd_up or not ceph_osd_in:
|
||||
ceph_osd_healthy_status[index] = False
|
||||
cluster_health_msg.append(
|
||||
"OSD {} in {},{} state".format(
|
||||
ceph_osd["id"], up_texts[ceph_osd_up], in_texts[ceph_osd_in]
|
||||
)
|
||||
)
|
||||
else:
|
||||
ceph_osd_healthy_status[index] = True
|
||||
ceph_osd_report_status[index] = (
|
||||
up_texts[ceph_osd_up] + "," + in_texts[ceph_osd_in]
|
||||
)
|
||||
|
||||
# Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy
|
||||
if maint_state == "true":
|
||||
cluster_health = "Maintenance"
|
||||
elif (
|
||||
cluster_healthy_status is False
|
||||
or False in node_healthy_status
|
||||
or False in vm_healthy_status
|
||||
or False in ceph_osd_healthy_status
|
||||
):
|
||||
cluster_health = "Degraded"
|
||||
else:
|
||||
cluster_health = "Optimal"
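Condensed, the decision above amounts to the following (a sketch, not the in-tree code): maintenance mode wins, any failed element check degrades the cluster, otherwise it is Optimal.

```python
# Aggregate overall health from the per-element healthy_status lists.
def overall_health(maint_state, *healthy_status_lists):
    if maint_state == "true":
        return "Maintenance"
    if any(False in statuses for statuses in healthy_status_lists):
        return "Degraded"
    return "Optimal"

node_ok = [True, True, True]
vm_ok = [True, False]
osd_ok = [True, True]
print(overall_health("false", node_ok, vm_ok, osd_ok))  # Degraded
```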
|
||||
|
||||
# Find out our storage health from Ceph
|
||||
ceph_status = zkhandler.read("base.storage").split("\n")
|
||||
ceph_health = ceph_status[2].split()[-1]
|
||||
|
||||
# Parse the status output to get the health indicators
|
||||
line_record = False
|
||||
for index, line in enumerate(ceph_status):
|
||||
if re.search("services:", line):
|
||||
line_record = False
|
||||
if line_record and len(line.strip()) > 0:
|
||||
storage_health_msg.append(line.strip())
|
||||
if re.search("health:", line):
|
||||
line_record = True
|
||||
|
||||
if maint_state == "true":
|
||||
storage_health = "Maintenance"
|
||||
elif ceph_health != "HEALTH_OK":
|
||||
storage_health = "Degraded"
|
||||
else:
|
||||
storage_health = "Optimal"
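For reference, the same parsing logic can be exercised against a standalone sample of `ceph status` plaintext; the sample output below is illustrative only, not taken from a real cluster.

```python
import re

# Illustrative `ceph status` plaintext, as stored by PVC under the base.storage key.
ceph_status = """  cluster:
    id:     00000000-0000-0000-0000-000000000000
    health: HEALTH_WARN
            1 osds down
            Degraded data redundancy

  services:
    mon: 3 daemons, quorum node1,node2,node3
""".split("\n")

# The overall health keyword is the last token of the third line.
ceph_health = ceph_status[2].split()[-1]

# Collect the indented detail lines between "health:" and "services:".
storage_health_msg = []
line_record = False
for line in ceph_status:
    if re.search("services:", line):
        line_record = False
    if line_record and len(line.strip()) > 0:
        storage_health_msg.append(line.strip())
    if re.search("health:", line):
        line_record = True

print(ceph_health)         # HEALTH_WARN
print(storage_health_msg)  # ['1 osds down', 'Degraded data redundancy']
```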
|
||||
|
||||
# State lists
|
||||
node_state_combinations = [
|
||||
"run,ready",
|
||||
@ -223,8 +256,13 @@ def getClusterInformation(zkhandler):
|
||||
"stop,unflush",
|
||||
"dead,ready",
|
||||
"dead,flush",
|
||||
"dead,fence-flush",
|
||||
"dead,flushed",
|
||||
"dead,unflush",
|
||||
"fenced,ready",
|
||||
"fenced,flush",
|
||||
"fenced,flushed",
|
||||
"fenced,unflush",
|
||||
]
|
||||
vm_state_combinations = [
|
||||
"start",
|
||||
@ -237,13 +275,19 @@ def getClusterInformation(zkhandler):
|
||||
"unmigrate",
|
||||
"provision",
|
||||
]
|
||||
ceph_osd_state_combinations = ["up,in", "up,out", "down,in", "down,out"]
|
||||
ceph_osd_state_combinations = [
|
||||
"up,in",
|
||||
"up,out",
|
||||
"down,in",
|
||||
"down,out",
|
||||
]
|
||||
|
||||
# Format the Node states
|
||||
formatted_node_states = {"total": node_count}
|
||||
for state in node_state_combinations:
|
||||
state_count = 0
|
||||
for node_state in node_report_status:
|
||||
for node in node_list:
|
||||
node_state = f"{node['daemon_state']},{node['domain_state']}"
|
||||
if node_state == state:
|
||||
state_count += 1
|
||||
if state_count > 0:
|
||||
@ -253,17 +297,20 @@ def getClusterInformation(zkhandler):
|
||||
formatted_vm_states = {"total": vm_count}
|
||||
for state in vm_state_combinations:
|
||||
state_count = 0
|
||||
for vm_state in vm_report_status:
|
||||
if vm_state == state:
|
||||
for vm in vm_list:
|
||||
if vm["state"] == state:
|
||||
state_count += 1
|
||||
if state_count > 0:
|
||||
formatted_vm_states[state] = state_count
|
||||
|
||||
# Format the OSD states
|
||||
up_texts = {1: "up", 0: "down"}
|
||||
in_texts = {1: "in", 0: "out"}
|
||||
formatted_osd_states = {"total": ceph_osd_count}
|
||||
for state in ceph_osd_state_combinations:
|
||||
state_count = 0
|
||||
for ceph_osd_state in ceph_osd_report_status:
|
||||
for ceph_osd in ceph_osd_list:
|
||||
ceph_osd_state = f"{up_texts[ceph_osd['stats']['up']]},{in_texts[ceph_osd['stats']['in']]}"
|
||||
if ceph_osd_state == state:
|
||||
state_count += 1
|
||||
if state_count > 0:
|
||||
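The counting pattern repeated here for nodes, VMs, and OSDs can be condensed into a small helper; this is a hypothetical sketch for clarity, not the code in the tree.

```python
# Count items per known state string, keeping only non-zero buckets.
def format_states(items, state_combinations, state_of):
    formatted = {"total": len(items)}
    for state in state_combinations:
        count = sum(1 for item in items if state_of(item) == state)
        if count > 0:
            formatted[state] = count
    return formatted

node_list = [
    {"daemon_state": "run", "domain_state": "ready"},
    {"daemon_state": "run", "domain_state": "flushed"},
    {"daemon_state": "run", "domain_state": "ready"},
]
node_state_combinations = ["run,ready", "run,flush", "run,flushed"]
print(format_states(
    node_list,
    node_state_combinations,
    lambda n: f"{n['daemon_state']},{n['domain_state']}",
))  # {'total': 3, 'run,ready': 2, 'run,flushed': 1}
```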
@ -271,11 +318,13 @@ def getClusterInformation(zkhandler):
|
||||
|
||||
# Format the status data
|
||||
cluster_information = {
|
||||
"health": cluster_health,
|
||||
"health_msg": cluster_health_msg,
|
||||
"storage_health": storage_health,
|
||||
"storage_health_msg": storage_health_msg,
|
||||
"primary_node": common.getPrimaryNode(zkhandler),
|
||||
"cluster_health": getClusterHealth(
|
||||
zkhandler, node_list, vm_list, ceph_osd_list
|
||||
),
|
||||
"node_health": getNodeHealth(zkhandler, node_list),
|
||||
"maintenance": maintenance_state,
|
||||
"primary_node": primary_node,
|
||||
"pvc_version": pvc_version,
|
||||
"upstream_ip": zkhandler.read("base.config.upstream_ip"),
|
||||
"nodes": formatted_node_states,
|
||||
"vms": formatted_vm_states,
|
||||
|
@ -37,6 +37,7 @@ from functools import wraps
|
||||
# Performance Profiler decorator
|
||||
###############################################################################
|
||||
|
||||
|
||||
# Get performance statistics on a function or class
|
||||
class Profiler(object):
|
||||
def __init__(self, config):
|
||||
@ -104,6 +105,7 @@ class Profiler(object):
|
||||
# Supplemental functions
|
||||
###############################################################################
|
||||
|
||||
|
||||
#
|
||||
# Run a local OS daemon in the background
|
||||
#
|
||||
@ -638,9 +640,9 @@ def findTargetNode(zkhandler, dom_uuid):
|
||||
|
||||
# Execute the search
|
||||
if search_field == "mem":
|
||||
return findTargetNodeMem(zkhandler, node_limit, dom_uuid)
|
||||
if search_field == "memfree":
|
||||
return findTargetNodeMemFree(zkhandler, node_limit, dom_uuid)
|
||||
if search_field == "memprov":
|
||||
return findTargetNodeMemProv(zkhandler, node_limit, dom_uuid)
|
||||
if search_field == "load":
|
||||
return findTargetNodeLoad(zkhandler, node_limit, dom_uuid)
|
||||
if search_field == "vcpus":
|
||||
@ -678,10 +680,28 @@ def getNodes(zkhandler, node_limit, dom_uuid):
|
||||
return valid_node_list
|
||||
|
||||
|
||||
#
|
||||
# via free memory
|
||||
#
|
||||
def findTargetNodeMemFree(zkhandler, node_limit, dom_uuid):
|
||||
most_memfree = 0
|
||||
target_node = None
|
||||
|
||||
node_list = getNodes(zkhandler, node_limit, dom_uuid)
|
||||
for node in node_list:
|
||||
memfree = int(zkhandler.read(("node.memory.free", node)))
|
||||
|
||||
if memfree > most_memfree:
|
||||
most_memfree = memfree
|
||||
target_node = node
|
||||
|
||||
return target_node
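As a simplified model of the "memfree" selector above, assuming the per-node free-memory readings have already been collected into a plain dict rather than read from Zookeeper:

```python
# Among candidate nodes (optionally limited to node_limit), choose the one
# reporting the most free memory; return None if there are no candidates.
def find_target_node_memfree(memfree_by_node, node_limit=None):
    candidates = memfree_by_node
    if node_limit:
        candidates = {n: m for n, m in memfree_by_node.items() if n in node_limit}
    if not candidates:
        return None
    return max(candidates, key=candidates.get)

readings = {"hv1": 20480, "hv2": 51200, "hv3": 8192}
print(find_target_node_memfree(readings))                  # hv2
print(find_target_node_memfree(readings, ["hv1", "hv3"]))  # hv1
```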
|
||||
|
||||
|
||||
#
|
||||
# via provisioned memory
|
||||
#
|
||||
def findTargetNodeMem(zkhandler, node_limit, dom_uuid):
|
||||
def findTargetNodeMemProv(zkhandler, node_limit, dom_uuid):
|
||||
most_provfree = 0
|
||||
target_node = None
|
||||
|
||||
@ -700,24 +720,6 @@ def findTargetNodeMem(zkhandler, node_limit, dom_uuid):
|
||||
return target_node
|
||||
|
||||
|
||||
#
|
||||
# via free memory
|
||||
#
|
||||
def findTargetNodeMemFree(zkhandler, node_limit, dom_uuid):
|
||||
most_memfree = 0
|
||||
target_node = None
|
||||
|
||||
node_list = getNodes(zkhandler, node_limit, dom_uuid)
|
||||
for node in node_list:
|
||||
memfree = int(zkhandler.read(("node.memory.free", node)))
|
||||
|
||||
if memfree > most_memfree:
|
||||
most_memfree = memfree
|
||||
target_node = node
|
||||
|
||||
return target_node
|
||||
|
||||
|
||||
#
|
||||
# via load average
|
||||
#
|
||||
|
@ -112,7 +112,6 @@ class Logger(object):
|
||||
|
||||
# Output function
|
||||
def out(self, message, state=None, prefix=""):
|
||||
|
||||
# Get the date
|
||||
if self.config["log_dates"]:
|
||||
date = "{} ".format(datetime.now().strftime("%Y/%m/%d %H:%M:%S.%f"))
|
||||
|
1
daemon-common/migrations/versions/9.json
Normal file
@ -0,0 +1 @@
|
||||
{"version": "9", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", 
"ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}
|
@ -21,6 +21,7 @@
|
||||
|
||||
import time
|
||||
import re
|
||||
import json
|
||||
|
||||
import daemon_lib.common as common
|
||||
|
||||
@ -49,6 +50,44 @@ def getNodeInformation(zkhandler, node_name):
|
||||
zkhandler.read(("node.count.provisioned_domains", node_name))
|
||||
)
|
||||
node_running_domains = zkhandler.read(("node.running_domains", node_name)).split()
|
||||
try:
|
||||
node_health = int(zkhandler.read(("node.monitoring.health", node_name)))
|
||||
except Exception:
|
||||
node_health = "N/A"
|
||||
try:
|
||||
node_health_plugins = zkhandler.read(
|
||||
("node.monitoring.plugins", node_name)
|
||||
).split()
|
||||
except Exception:
|
||||
node_health_plugins = list()
|
||||
|
||||
node_health_details = list()
|
||||
for plugin in node_health_plugins:
|
||||
plugin_last_run = zkhandler.read(
|
||||
("node.monitoring.data", node_name, "monitoring_plugin.last_run", plugin)
|
||||
)
|
||||
plugin_health_delta = zkhandler.read(
|
||||
(
|
||||
"node.monitoring.data",
|
||||
node_name,
|
||||
"monitoring_plugin.health_delta",
|
||||
plugin,
|
||||
)
|
||||
)
|
||||
plugin_message = zkhandler.read(
|
||||
("node.monitoring.data", node_name, "monitoring_plugin.message", plugin)
|
||||
)
|
||||
plugin_data = zkhandler.read(
|
||||
("node.monitoring.data", node_name, "monitoring_plugin.data", plugin)
|
||||
)
|
||||
plugin_output = {
|
||||
"name": plugin,
|
||||
"last_run": int(plugin_last_run),
|
||||
"health_delta": int(plugin_health_delta),
|
||||
"message": plugin_message,
|
||||
"data": json.loads(plugin_data),
|
||||
}
|
||||
node_health_details.append(plugin_output)
|
||||
|
||||
# Construct a data structure to represent the data
|
||||
node_information = {
|
||||
@ -61,10 +100,16 @@ def getNodeInformation(zkhandler, node_name):
|
||||
"kernel": node_kernel,
|
||||
"os": node_os,
|
||||
"arch": node_arch,
|
||||
"health": node_health,
|
||||
"health_plugins": node_health_plugins,
|
||||
"health_details": node_health_details,
|
||||
"load": node_load,
|
||||
"domains_count": node_domains_count,
|
||||
"running_domains": node_running_domains,
|
||||
"vcpu": {"total": node_cpu_count, "allocated": node_vcpu_allocated},
|
||||
"vcpu": {
|
||||
"total": node_cpu_count,
|
||||
"allocated": node_vcpu_allocated,
|
||||
},
|
||||
"memory": {
|
||||
"total": node_mem_total,
|
||||
"allocated": node_mem_allocated,
|
||||
@ -82,16 +127,14 @@ def getNodeInformation(zkhandler, node_name):
|
||||
def secondary_node(zkhandler, node):
|
||||
# Verify node is valid
|
||||
if not common.verifyNode(zkhandler, node):
|
||||
return False, 'ERROR: No node named "{}" is present in the cluster.'.format(
|
||||
node
|
||||
)
|
||||
return False, "ERROR: No node named {} is present in the cluster.".format(node)
|
||||
|
||||
# Ensure node is a coordinator
|
||||
daemon_mode = zkhandler.read(("node.mode", node))
|
||||
if daemon_mode == "hypervisor":
|
||||
return (
|
||||
False,
|
||||
'ERROR: Cannot change coordinator mode on non-coordinator node "{}"'.format(
|
||||
"ERROR: Cannot change coordinator state on non-coordinator node {}".format(
|
||||
node
|
||||
),
|
||||
)
|
||||
@ -99,14 +142,14 @@ def secondary_node(zkhandler, node):
|
||||
# Ensure node is in run daemonstate
|
||||
daemon_state = zkhandler.read(("node.state.daemon", node))
|
||||
if daemon_state != "run":
|
||||
return False, 'ERROR: Node "{}" is not active'.format(node)
|
||||
return False, "ERROR: Node {} is not active".format(node)
|
||||
|
||||
# Get current state
|
||||
current_state = zkhandler.read(("node.state.router", node))
|
||||
if current_state == "secondary":
|
||||
return True, 'Node "{}" is already in secondary coordinator mode.'.format(node)
|
||||
return True, "Node {} is already in secondary coordinator state.".format(node)
|
||||
|
||||
retmsg = "Setting node {} in secondary coordinator mode.".format(node)
|
||||
retmsg = "Setting node {} in secondary coordinator state.".format(node)
|
||||
zkhandler.write([("base.config.primary_node", "none")])
|
||||
|
||||
return True, retmsg
|
||||
@ -115,16 +158,14 @@ def secondary_node(zkhandler, node):
|
||||
def primary_node(zkhandler, node):
|
||||
# Verify node is valid
|
||||
if not common.verifyNode(zkhandler, node):
|
||||
return False, 'ERROR: No node named "{}" is present in the cluster.'.format(
|
||||
node
|
||||
)
|
||||
return False, "ERROR: No node named {} is present in the cluster.".format(node)
|
||||
|
||||
# Ensure node is a coordinator
|
||||
daemon_mode = zkhandler.read(("node.mode", node))
|
||||
if daemon_mode == "hypervisor":
|
||||
return (
|
||||
False,
|
||||
'ERROR: Cannot change coordinator mode on non-coordinator node "{}"'.format(
|
||||
"ERROR: Cannot change coordinator state on non-coordinator node {}".format(
|
||||
node
|
||||
),
|
||||
)
|
||||
@ -132,14 +173,14 @@ def primary_node(zkhandler, node):
|
||||
# Ensure node is in run daemonstate
|
||||
daemon_state = zkhandler.read(("node.state.daemon", node))
|
||||
if daemon_state != "run":
|
||||
return False, 'ERROR: Node "{}" is not active'.format(node)
|
||||
return False, "ERROR: Node {} is not active".format(node)
|
||||
|
||||
# Get current state
|
||||
current_state = zkhandler.read(("node.state.router", node))
|
||||
if current_state == "primary":
|
||||
return True, 'Node "{}" is already in primary coordinator mode.'.format(node)
|
||||
return True, "Node {} is already in primary coordinator state.".format(node)
|
||||
|
||||
retmsg = "Setting node {} in primary coordinator mode.".format(node)
|
||||
retmsg = "Setting node {} in primary coordinator state.".format(node)
|
||||
zkhandler.write([("base.config.primary_node", node)])
|
||||
|
||||
return True, retmsg
|
||||
@ -148,14 +189,12 @@ def primary_node(zkhandler, node):
|
||||
def flush_node(zkhandler, node, wait=False):
|
||||
# Verify node is valid
|
||||
if not common.verifyNode(zkhandler, node):
|
||||
return False, 'ERROR: No node named "{}" is present in the cluster.'.format(
|
||||
node
|
||||
)
|
||||
return False, "ERROR: No node named {} is present in the cluster.".format(node)
|
||||
|
||||
if zkhandler.read(("node.state.domain", node)) == "flushed":
|
||||
return True, "Hypervisor {} is already flushed.".format(node)
|
||||
return True, "Node {} is already flushed.".format(node)
|
||||
|
||||
retmsg = "Flushing hypervisor {} of running VMs.".format(node)
|
||||
retmsg = "Removing node {} from active service.".format(node)
|
||||
|
||||
# Add the new domain to Zookeeper
|
||||
zkhandler.write([(("node.state.domain", node), "flush")])
|
||||
@ -163,7 +202,7 @@ def flush_node(zkhandler, node, wait=False):
|
||||
if wait:
|
||||
while zkhandler.read(("node.state.domain", node)) == "flush":
|
||||
time.sleep(1)
|
||||
retmsg = "Flushed hypervisor {} of running VMs.".format(node)
|
||||
retmsg = "Removed node {} from active service.".format(node)
|
||||
|
||||
return True, retmsg
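The flush logic follows a simple write-then-wait pattern: set the desired domain state, then optionally poll until the node daemon has acted on it. A minimal sketch with the Zookeeper handler and node daemon stubbed out:

```python
import time

# A dict stands in for Zookeeper; node_daemon_tick() stands in for the node
# daemon, which in production performs the flush and updates the state itself.
state = {"node.state.domain": "ready"}

def node_daemon_tick():
    if state["node.state.domain"] == "flush":
        state["node.state.domain"] = "flushed"

def flush_node(wait=False):
    state["node.state.domain"] = "flush"
    retmsg = "Removing node from active service."
    if wait:
        while state["node.state.domain"] == "flush":
            node_daemon_tick()  # happens on the node itself in a real cluster
            time.sleep(0.1)
        retmsg = "Removed node from active service."
    return True, retmsg

print(flush_node(wait=True))  # (True, 'Removed node from active service.')
```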
|
||||
|
||||
@ -171,14 +210,12 @@ def flush_node(zkhandler, node, wait=False):
|
||||
def ready_node(zkhandler, node, wait=False):
|
||||
# Verify node is valid
|
||||
if not common.verifyNode(zkhandler, node):
|
||||
return False, 'ERROR: No node named "{}" is present in the cluster.'.format(
|
||||
node
|
||||
)
|
||||
return False, "ERROR: No node named {} is present in the cluster.".format(node)
|
||||
|
||||
if zkhandler.read(("node.state.domain", node)) == "ready":
|
||||
return True, "Hypervisor {} is already ready.".format(node)
|
||||
return True, "Node {} is already ready.".format(node)
|
||||
|
||||
retmsg = "Restoring hypervisor {} to active service.".format(node)
|
||||
retmsg = "Restoring node {} to active service.".format(node)
|
||||
|
||||
# Add the new domain to Zookeeper
|
||||
zkhandler.write([(("node.state.domain", node), "unflush")])
|
||||
@ -186,7 +223,7 @@ def ready_node(zkhandler, node, wait=False):
|
||||
if wait:
|
||||
while zkhandler.read(("node.state.domain", node)) == "unflush":
|
||||
time.sleep(1)
|
||||
retmsg = "Restored hypervisor {} to active service.".format(node)
|
||||
retmsg = "Restored node {} to active service.".format(node)
|
||||
|
||||
return True, retmsg
|
||||
|
||||
@ -194,9 +231,7 @@ def ready_node(zkhandler, node, wait=False):
|
||||
def get_node_log(zkhandler, node, lines=2000):
|
||||
# Verify node is valid
|
||||
if not common.verifyNode(zkhandler, node):
|
||||
return False, 'ERROR: No node named "{}" is present in the cluster.'.format(
|
||||
node
|
||||
)
|
||||
return False, "ERROR: No node named {} is present in the cluster.".format(node)
|
||||
|
||||
# Get the data from ZK
|
||||
node_log = zkhandler.read(("logs.messages", node))
|
||||
@ -214,14 +249,12 @@ def get_node_log(zkhandler, node, lines=2000):
|
||||
def get_info(zkhandler, node):
|
||||
# Verify node is valid
|
||||
if not common.verifyNode(zkhandler, node):
|
||||
return False, 'ERROR: No node named "{}" is present in the cluster.'.format(
|
||||
node
|
||||
)
|
||||
return False, "ERROR: No node named {} is present in the cluster.".format(node)
|
||||
|
||||
# Get information about node in a pretty format
|
||||
node_information = getNodeInformation(zkhandler, node)
|
||||
if not node_information:
|
||||
return False, 'ERROR: Could not get information about node "{}".'.format(node)
|
||||
return False, "ERROR: Could not get information about node {}.".format(node)
|
||||
|
||||
return True, node_information
|
||||
|
||||
|
@ -644,7 +644,7 @@ def rename_vm(zkhandler, domain, new_domain):
|
||||
|
||||
# Verify that the VM is in a stopped state; renaming is not supported otherwise
|
||||
state = zkhandler.read(("domain.state", dom_uuid))
|
||||
if state != "stop":
|
||||
if state not in ["stop", "disable"]:
|
||||
return (
|
||||
False,
|
||||
'ERROR: VM "{}" is not in stopped state; VMs cannot be renamed while running.'.format(
|
||||
|
@ -540,7 +540,7 @@ class ZKHandler(object):
|
||||
#
|
||||
class ZKSchema(object):
|
||||
# Current version
|
||||
_version = 8
|
||||
_version = 9
|
||||
|
||||
# Root for doing nested keys
|
||||
_schema_root = ""
|
||||
@ -569,6 +569,7 @@ class ZKSchema(object):
|
||||
"domain": f"{_schema_root}/domains",
|
||||
"network": f"{_schema_root}/networks",
|
||||
"storage": f"{_schema_root}/ceph",
|
||||
"storage.health": f"{_schema_root}/ceph/health",
|
||||
"storage.util": f"{_schema_root}/ceph/util",
|
||||
"osd": f"{_schema_root}/ceph/osds",
|
||||
"pool": f"{_schema_root}/ceph/pools",
|
||||
@ -608,6 +609,18 @@ class ZKSchema(object):
|
||||
"sriov": "/sriov",
|
||||
"sriov.pf": "/sriov/pf",
|
||||
"sriov.vf": "/sriov/vf",
|
||||
"monitoring.plugins": "/monitoring_plugins",
|
||||
"monitoring.data": "/monitoring_data",
|
||||
"monitoring.health": "/monitoring_health",
|
||||
},
|
||||
# The schema of an individual monitoring plugin data entry (/nodes/{node_name}/monitoring_data/{plugin})
|
||||
"monitoring_plugin": {
|
||||
"name": "", # The root key
|
||||
"last_run": "/last_run",
|
||||
"health_delta": "/health_delta",
|
||||
"message": "/message",
|
||||
"data": "/data",
|
||||
"runtime": "/runtime",
|
||||
},
|
||||
# The schema of an individual SR-IOV PF entry (/nodes/{node_name}/sriov/pf/{pf})
|
||||
"sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, # The root key
|
||||
@ -874,9 +887,10 @@ class ZKSchema(object):
|
||||
if not zkhandler.zk_conn.exists(nkipath):
|
||||
result = False
|
||||
|
||||
# One might expect child keys under node (specifically, sriov.pf and sriov.vf) to be
|
||||
# managed here as well, but those are created automatically every time pvcnoded starts
|
||||
# and thus never need to be validated or applied.
|
||||
# One might expect child keys under node (specifically, sriov.pf, sriov.vf,
|
||||
# monitoring.data) to be managed here as well, but those are created
|
||||
# automatically every time pvcnoded starts and thus never need to be validated
|
||||
# or applied.
|
||||
|
||||
# These two have several children layers that must be parsed through
|
||||
for elem in ["volume"]:
|
||||
|
141
debian/changelog
vendored
@ -1,3 +1,144 @@
|
||||
pvc (0.9.76-0) unstable; urgency=high
|
||||
|
||||
* [API, Client CLI] Corrects some missing node states for fencing in status output
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Mon, 18 Sep 2023 10:15:52 -0400
|
||||
|
||||
pvc (0.9.75-0) unstable; urgency=high
|
||||
|
||||
* [Node Daemon] Adds a startup message about IPMI when succeeding
|
||||
* [Node Daemon] Fixes a bug in fencing allowing non-failing VMs to migrate
|
||||
* [Node Daemon] Adds rounding to load average in load plugin for consistency
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Sat, 16 Sep 2023 23:06:38 -0400
|
||||
|
||||
pvc (0.9.74-0) unstable; urgency=high
|
||||
|
||||
* [Docs] Removes docs from the main repo
|
||||
* [Client CLI] Ensures that "provision" VMs are shown in the right colour
|
||||
* [Node Daemon] Separates the node monitoring subsystem into its own thread with a longer, customizable update interval
|
||||
* [Node Daemon] Adds checks for PSU input power redundancy (psur) and hardware RAID (hwrd)
|
||||
* [Node Daemon] Updates when Keepalive start messages are printed (end of run, with runtime) to align with new monitoring messages
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Sat, 16 Sep 2023 00:18:13 -0400
|
||||
|
||||
pvc (0.9.73-0) unstable; urgency=high
|
||||
|
||||
* [Node Daemon] Fixes a bug creating monitoring instance
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Sat, 02 Sep 2023 02:16:19 -0400
|
||||
|
||||
pvc (0.9.72-0) unstable; urgency=high
|
||||
|
||||
* [CLI] Restores old functionality for default node value
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Fri, 01 Sep 2023 16:34:45 -0400
|
||||
|
||||
pvc (0.9.71-0) unstable; urgency=high
|
||||
|
||||
* [API] Adds API support for Debian Bookworm
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Fri, 01 Sep 2023 00:30:42 -0400
|
||||
|
||||
pvc (0.9.70-0) unstable; urgency=high
|
||||
|
||||
* [Node Daemon] Fixes several compatibility issues for Debian 12 "Bookworm"
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Thu, 31 Aug 2023 14:15:54 -0400
|
||||
|
||||
pvc (0.9.69-0) unstable; urgency=high
|
||||
|
||||
* [Node Daemon] Ensures that system load is always 2 decimal places on Bookworm
|
||||
* [Node Daemon] Fixes bug blocking primary takeover at DNS Aggregator start if Patroni is down
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Tue, 29 Aug 2023 22:01:22 -0400
|
||||
|
||||
pvc (0.9.68-0) unstable; urgency=high
|
||||
|
||||
* [CLI] Fixes another bug with network info view
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Sun, 27 Aug 2023 20:59:23 -0400
|
||||
|
||||
pvc (0.9.67-0) unstable; urgency=high
|
||||
|
||||
* [CLI] Fixes several more bugs in the refactored CLI
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Sun, 27 Aug 2023 14:47:20 -0400
|
||||
|
||||
pvc (0.9.66-0) unstable; urgency=high
|
||||
|
||||
* [CLI] Fixes a missing YAML import in CLI
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Sun, 27 Aug 2023 11:36:05 -0400
|
||||
|
||||
pvc (0.9.65-0) unstable; urgency=high
|
||||
|
||||
* [CLI] Fixes a bug in the node list filtering command
|
||||
* [CLI] Fixes a bug/default when no connection is specified
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Wed, 23 Aug 2023 01:56:57 -0400
|
||||
|
||||
pvc (0.9.64-0) unstable; urgency=high
|
||||
|
||||
**Breaking Change [CLI]**: The CLI client root commands have been reorganized. The following commands have changed:
|
||||
|
||||
* `pvc cluster` -> `pvc connection` (all subcommands)
|
||||
* `pvc task` -> `pvc cluster` (all subcommands)
|
||||
* `pvc maintenance` -> `pvc cluster maintenance`
|
||||
* `pvc status` -> `pvc cluster status`
|
||||
|
||||
Ensure you have updated to the latest version of the PVC Ansible repository before deploying this version or using PVC Ansible oneshot playbooks for management.
|
||||
|
||||
**Breaking Change [CLI]**: The `--restart` option for VM configuration changes now has an explicit `--no-restart` to disable restarting, or a prompt if neither is specified; `--unsafe` no longer bypasses this prompt which was a bug. Applies to most `vm <cmd> set` commands like `vm vcpu set`, `vm memory set`, etc. All instances also feature restart confirmation afterwards, which, if `--restart` is provided, will prompt for confirmation unless `--yes` or `--unsafe` is specified.
|
||||
|
||||
**Breaking Change [CLI]**: The `--long` option previously on some `info` commands no longer exists; use `-f long`/`--format long` instead.
|
||||
|
||||
* [CLI] Significantly refactors the CLI client code for consistency and cleanliness
|
||||
* [CLI] Implements `-f`/`--format` options for all `list` and `info` commands in a consistent way
|
||||
* [CLI] Changes the behaviour of VM modification options with "--restart" to provide a "--no-restart"; defaults to a prompt if neither is specified and ignores the "--unsafe" global entirely
|
||||
* [API] Fixes several bugs in the 3-debootstrap.py provisioner example script
|
||||
* [Node] Fixes some bugs around VM shutdown on node flush
|
||||
* [Documentation] Adds mentions of Ganeti and Harvester
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Fri, 18 Aug 2023 12:20:43 -0400
|
||||
|
||||
pvc (0.9.63-0) unstable; urgency=high
|
||||
|
||||
* Mentions Ganeti in the docs
|
||||
* Increases API timeout back to 2s
|
||||
* Adds .update-* configs to dpkg plugin
|
||||
* Adds full/nearfull OSD warnings
|
||||
* Improves size value handling for volumes
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Fri, 28 Apr 2023 14:47:04 -0400
|
||||
|
||||
pvc (0.9.62-0) unstable; urgency=high
|
||||
|
||||
* [all] Adds an enhanced health checking, monitoring, and reporting system for nodes and clusters
|
||||
* [cli] Adds a cluster detail command
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Wed, 22 Feb 2023 18:13:45 -0500
|
||||
|
||||
pvc (0.9.61-0) unstable; urgency=high
|
||||
|
||||
* [provisioner] Fixes a bug in network comparison
|
||||
* [api] Fixes a bug being unable to rename disabled VMs
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Wed, 08 Feb 2023 10:08:05 -0500
|
||||
|
||||
pvc (0.9.60-0) unstable; urgency=high
|
||||
|
||||
* [Provisioner] Cleans up several remaining bugs in the example scripts; they should all be valid now
|
||||
* [Provisioner] Adjust default libvirt schema to disable RBD caching for a 2x+ performance boost
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Tue, 06 Dec 2022 15:42:55 -0500
|
||||
|
||||
pvc (0.9.59-0) unstable; urgency=high
|
||||
|
||||
* [API] Flips the mem(prov) and mem(free) selectors making mem(free) the default for "mem" and "memprov" explicit
|
||||
|
||||
-- Joshua M. Boniface <joshua@boniface.me> Tue, 15 Nov 2022 15:50:15 -0500
|
||||
|
||||
pvc (0.9.58-0) unstable; urgency=high
|
||||
|
||||
* [API] Fixes a bug where migration selector could have case-sensitive operational faults
|
||||
|
2
debian/control
vendored
@ -16,7 +16,7 @@ Description: Parallel Virtual Cluster node daemon (Python 3)
|
||||
|
||||
Package: pvc-daemon-api
|
||||
Architecture: all
|
||||
Depends: systemd, pvc-daemon-common, python3-yaml, python3-flask, python3-flask-restful, python3-celery, python-celery-common, python3-distutils, redis, python3-redis, python3-lxml, python3-flask-migrate, python3-flask-script, fio
|
||||
Depends: systemd, pvc-daemon-common, python3-yaml, python3-flask, python3-flask-restful, python3-celery, python-celery-common, python3-distutils, redis, python3-redis, python3-lxml, python3-flask-migrate, fio
|
||||
Description: Parallel Virtual Cluster API daemon (Python 3)
|
||||
A KVM/Zookeeper/Ceph-based VM and private cloud manager
|
||||
.
|
||||
|
2
debian/pvc-daemon-api.install
vendored
@ -1,5 +1,5 @@
|
||||
api-daemon/pvcapid.py usr/share/pvc
|
||||
api-daemon/pvcapid-manage.py usr/share/pvc
|
||||
api-daemon/pvcapid-manage*.py usr/share/pvc
|
||||
api-daemon/pvc-api-db-upgrade usr/share/pvc
|
||||
api-daemon/pvcapid.sample.yaml etc/pvc
|
||||
api-daemon/pvcapid usr/share/pvc
|
||||
|
1
debian/pvc-daemon-node.install
vendored
@ -5,3 +5,4 @@ node-daemon/pvcnoded.service lib/systemd/system
|
||||
node-daemon/pvc.target lib/systemd/system
|
||||
node-daemon/pvcautoready.service lib/systemd/system
|
||||
node-daemon/monitoring usr/share/pvc
|
||||
node-daemon/plugins usr/share/pvc
|
||||
|
164
docs/about.md
@ -1,164 +0,0 @@
|
||||
# About the Parallel Virtual Cluster system
|
||||
|
||||
- [About the Parallel Virtual Cluster system](#about-the-parallel-virtual-cluster-system)
|
||||
* [Project Motivation](#project-motivation)
|
||||
* [Building Blocks](#building-blocks)
|
||||
* [Cluster Architecture](#cluster-architecture)
|
||||
* [Clients](#clients)
|
||||
+ [API Client](#api-client)
|
||||
+ [Direct Bindings](#direct-bindings)
|
||||
+ [CLI Client](#cli-client)
|
||||
* [Deployment](#deployment)
|
||||
* [Frequently Asked Questions](#frequently-asked-questions)
|
||||
+ [General Questions](#general-questions)
|
||||
+ [Feature Questions](#feature-questions)
|
||||
+ [Storage Questions](#storage-questions)
|
||||
* [About The Author](#about-the-author)
|
||||
|
||||
This document contains information about the project itself, the software stack, its motivations, and a number of frequently-asked questions.
|
||||
|
||||
## Project Motivation
|
||||
|
||||
Server administration has changed significantly in recent decades. Computing-as-a-resource and software-defined infrastructure are now the norm, and the days of pet servers, painstaking manual configuration, and installing from CD-ROM ISOs are long gone. This is a brave new world.
|
||||
|
||||
As part of these trends, Infrastructure-as-a-Service (IaaS) has become a critical component of server administration. Administrators and developers are increasingly interfacing with their infrastructure via programmable APIs and software tools, and automation is a hard requirement. While Container infrastructure like Docker and Kubernetes has become more and more popular in this space, Virtual Machines (VMs) are still a very common feature and do not seem to be going anywhere any time soon.
|
||||
|
||||
However, the current state of the free and open source virtualization ecosystem is lacking.
|
||||
|
||||
At the lower end, projects like ProxMox provide an easy way to administer small virtualization clusters, but these projects tend to lack advanced redundancy facilities that are built-in by default. While there are some new contenders in this space, such as Harvester, the options are limited and their feature-sets and tool stacks can be cumbersome or unproven.
|
||||
|
||||
At the higher end, very large projects like OpenStack and CloudStack provide very advanced functionality, but these projects are sprawling and complicated for administrators to use, and are heavily focused on large enterprise deployments, making them unsuitable for smaller clusters and teams.
|
||||
|
||||
Finally, proprietary solutions dominate this space. VMware and Nutanix are the two largest names, and their products provide functionality for both small and large clusters, but proprietary software limits both flexibility and freedom, and the costs associated with these solutions are immense.
|
||||
|
||||
PVC aims to bridge the gaps between these three categories. Like the larger FLOSS and proprietary projects, PVC can scale up to very large cluster sizes, while remaining usable even for small clusters as well. Like the smaller FLOSS and proprietary projects, PVC aims to be very simple to use, with a fully programmable API, allowing administrators to get on with more important things. Like the other FLOSS solutions, PVC is free, both as in beer and as in speech, allowing the administrator to inspect, modify, and tailor it to their needs. And finally, PVC is built from the ground-up to support host-level redundancy at every layer, rather than this being an expensive, optional, or tacked on feature.
|
||||
|
||||
In short, it is a Free Software, scalable, redundant, self-healing, and self-managing private cloud solution designed with administrator simplicity in mind.
|
||||
|
||||
## Building Blocks
|
||||
|
||||
PVC is built from a number of other open source components. The main system itself is a series of software daemons (services) written in Python 3, with the CLI interface also written in Python 3.
|
||||
|
||||
Virtual machines themselves are run with the Linux KVM subsystem via the Libvirt virtual machine management library. This provides the maximum flexibility and compatibility for running various guest operating systems in multiple modes (fully-virtualized, para-virtualized, virtio-enabled, etc.).
|
||||
|
||||
To manage cluster state, PVC uses Zookeeper. This is an Apache project designed to provide a highly-available and always-consistent key-value database. The various daemons all connect to the distributed Zookeeper database to both obtain details about cluster state, and to manage that state. For instance the node daemon watches Zookeeper for information on what VMs to run, networks to create, etc., while the API writes to or reads information from Zookeeper in response to requests. The Zookeeper database is the glue which holds the cluster together.
|
||||
|
||||
Additional relational database functionality, specifically for the managed network DNS aggregation subsystem and the VM provisioner, is provided by the PostgreSQL database system and the Patroni management tool, which provides automatic clustering and failover for PostgreSQL database instances.
|
||||
|
||||
Node network routing for managed networks, providing EBGP VXLAN and route-learning, is handled by FRRouting, a descendant project of Quagga and GNU Zebra. Upstream routers can use this interface to learn routes to cluster networks as well. PVC also makes extensive use of the standard Linux `iprouting` stack.
|
||||
|
||||
The storage subsystem is provided by Ceph, a distributed, object-based storage system with proven stability, extensive scalability, and self-managing, self-healing functionality. The Ceph RBD (RADOS Block Device) subsystem is used to provide VM block devices similar to traditional LVM or ZFS zvols, but in a distributed, shared-storage manner.
|
||||
|
||||
All the components are designed to be run on top of Debian GNU/Linux, specifically Debian 10.x "Buster" or 11.x "Bullseye", with the SystemD system service manager. This OS provides a stable base to run the various other subsystems while remaining truly Free Software, while SystemD provides functionality such as automatic daemon restarting and complex startup/shutdown ordering.
|
||||
|
||||
## Cluster Architecture
|
||||
|
||||
A PVC cluster is based around "nodes", which are physical servers on which the various daemons, storage, networks, and virtual machines run. Each node is self-contained and is able to perform any and all cluster functions if needed and configured to do so; there is no strict segmentation of function between different "types" of physical hosts. Ideally, all nodes in a cluster will be identical in specifications, but in some situations mismatched nodes are acceptable, with limitations.
|
||||
|
||||
A subset of the nodes, called "coordinators", are statically configured to provide services for the cluster. For instance, all databases, FRRouting instances, and Ceph management daemons run only on the set of cluster coordinators. At cluster bootstrap, 1 (testing-only), 3 (small clusters), or 5 (large clusters) nodes may be chosen as the coordinators. Other nodes can then be added as "hypervisor" nodes, which then provide only block device (storage) and VM (compute) functionality by connecting to the set of coordinators. This limits the scaling problem of the databases while ensuring there is still maximum redundancy and resiliency for the core cluster services.
|
||||
|
||||
Additional nodes can be added to the cluster either as coordinators, or as hypervisors, by adding them to the Ansible configuration and running it against the full set of nodes. Note that the number of coordinators must always be odd, and more than 5 coordinators are normally unnecessary and can cause issues with the database; it is thus normally advisable to add any nodes beyond the initial set as hypervisors instead of coordinators. Nodes can be removed from service, but this is a manual process and should not be attempted unless absolutely required; the Ceph subsystem in particular is sensitive to changes in the coordinator nodes. Nodes can also be upgraded or replaced dynamically and without interrupting the cluster, allowing for seamless hardware maintenance, upgrades, and even replacement, as cluster state configuration is held cluster-wide.
|
||||
|
||||
During runtime, one coordinator is elected the "primary" for the cluster. This designation can shift dynamically in response to cluster events, or be manually migrated by an administrator. The coordinator takes on a number of roles for which only one host may be active at once, for instance to provide DHCP services to managed client networks or to interface with the API.
|
||||
|
||||
Nodes are networked together via a set of statically-configured, simple layer-2 networks. At a minimum, 2 discrete networks are required, with an optional 3rd.
|
||||
|
||||
* The "upstream" network is the primary network for the nodes, and provides functions such as upstream Internet access, routing to and from the cluster nodes, and management via the API; it may be either a firewalled public or NAT'd RFC1918 network, but should never be exposed directly to the Internet. It should also contain, or be able to route to, the IPMI BMC management interfaces of the node chassis'.
|
||||
* The "cluster" network is an unrouted RFC1918 network which provides inter-node communication for managed client network traffic (VXLANs), cross-node routing, VM migration and failover, and database replication and access.
|
||||
* The "storage" network is another unrouted RFC1918 network which provides a dedicated logical and/or physical link between the nodes for storage traffic, including VM block device storage traffic, inter-OSD replication traffic, and Ceph heartbeat traffic, thus allowing it to be completely isolated from the other networks for maximum performance. This network can be optionally colocated with the "cluster" network, by specifying the same device for both, and can be further combined by specifying the same IP for both to completely collapse the "cluster" and "storage" networks. A collapsed cluster+storage configuration may be ideal to simplify management of small clusters, or a split configuration can be used to provide flexibility for large or demanding high-performance clusters - this choice is left to the administrator based on their needs.
|
||||
|
||||
Within each network is a single "floating" IP address which follows the primary coordinator, providing a single interface to the cluster. Once configured, the cluster is then able to create additional networks of two kinds, "bridged" traditional vLANs and "managed" routed VXLANs, to provide network access to VMs.
|
||||
|
||||
Further information about the general cluster architecture, including important considerations for node specifications/sizing and network configuration, [can be found at the cluster architecture page](/cluster-architecture). It is imperative that potential PVC administrators read this document thoroughly to understand the specific requirements of PVC and avoid potential missteps in obtaining and deploying their cluster.
|
||||
|
||||
## Clients
|
||||
|
||||
### API Client
|
||||
|
||||
The API client is a Flask-based RESTful API and is the core interface to PVC. By default the API will run on the primary coordinator, listening on TCP port 7370 on the "upstream" network floating IP address. All other clients communicate with this API to perform actions against the cluster. The API features basic authentication using UUID-based API keys to prevent unauthorized access, and can optionally be configured with full TLS encryption to provide integrity and confidentiality across public networks.
|
||||
|
||||
The API generally accepts all requests as HTTP form requests following standard RESTful guidelines, supporting arguments in the URI string or, with limited exceptions, in the message body. The API returns JSON response bodies to all requests consisting either of the information requested, or a `{ "message": "text" }` construct to pass informational status messages back to the client.
|
||||
|
||||
The API client manual can be found at the [API manual page](/manuals/api), and the full API details can be found in the [API reference specification](/manuals/api-reference.html).
|
||||
|
||||
### Direct Bindings
|
||||
|
||||
The API client uses a dedicated set of Python libraries, packaged as the `pvc-daemon-common` Debian package, to communicate with the cluster. One can thus use these libraries to build custom Python clients that directly interface with the PVC cluster, without having to get "into the weeds" of the Zookeeper or PostgreSQL databases.
|
||||
|
||||
### CLI Client
|
||||
|
||||
The CLI client is a Python Click application, which provides a convenient CLI interface to the API client. It supports connecting to multiple clusters from a single instance, with or without authentication and over both HTTP or HTTPS, including a special "local" cluster if the client determines that an API configuration exists on the local host. Information about the configured clusters is stored in a local JSON document, and a default cluster can be set with an environment variable. The CLI client can thus be run either on PVC nodes themselves, or on other, remote systems which can then interface with cluster(s) over the network.
|
||||
|
||||
The CLI client is self-documenting using the `-h`/`--help` arguments throughout, easing the administrator learning curve and providing easy access to command details. A short manual can also be found at the [CLI manual page](/manuals/cli).
|
||||
|
||||
## Deployment
|
||||
|
||||
The overall management, deployment, bootstrapping, and configuring of nodes is accomplished via a set of Ansible roles and playbooks, found in the [`pvc-ansible` repository](https://github.com/parallelvirtualcluster/pvc-ansible), and nodes are installed via a custom installer ISO generated by the [`pvc-installer` repository](https://github.com/parallelvirtualcluster/pvc-installer). Once the cluster is set up, nodes can be added, replaced, updated, or reconfigured using this Ansible framework.
|
||||
|
||||
Details about the Ansible setup and node installer can be found in those repositories.
|
||||
|
||||
The [getting started documentation](/getting-started) provides a walk-through of using these tools to bootstrap a new cluster.
|
||||
|
||||
## Frequently Asked Questions
|
||||
|
||||
### General Questions
|
||||
|
||||
#### What is it?
|
||||
|
||||
PVC is a virtual machine management suite designed around high-availability and ease-of-use. It can be considered an alternative to OpenStack, ProxMox, Nutanix, and other similar solutions that manage not just the VMs, but the surrounding infrastructure as well.
|
||||
|
||||
#### Why would you make this?
|
||||
|
||||
After becoming frustrated by numerous other management tools, I discovered that what I wanted didn't exist as FLOSS software, so I built it myself. Since then, I have also been able to leverage PVC both for my own purposes as well as for my employer, a win-win for the project.
|
||||
|
||||
#### Is PVC right for me?
|
||||
|
||||
PVC might be right for you if:
|
||||
|
||||
1. You need KVM-based VMs.
|
||||
2. You want management of storage and networking (a.k.a. "batteries-included") in the same tool.
|
||||
3. You want hypervisor-level redundancy, able to tolerate hypervisor downtime seamlessly, for all elements of the stack.
|
||||
4. You have a requirement of at least 3 nodes' worth of compute and storage.
|
||||
|
||||
If all you want is a simple home server solution, or you demand scalability beyond a few dozen compute nodes, PVC is likely not what you're looking for. Its sweet spot is specifically in the 3-9 node range, for instance in an advanced homelab, for SMBs or small ISPs with a relatively small server stack, or for MSPs looking to deploy small on-premises clusters at low cost.
|
||||
|
||||
#### Is 3 hypervisors really the minimum?
|
||||
|
||||
For a redundant cluster, yes. PVC requires a majority quorum for proper operation at various levels, and the smallest possible majority quorum is 2-of-3; thus 3 nodes is the smallest safe minimum. That said, you can run PVC on a single node for testing/lab purposes without host-level redundancy, should you wish to do so, and it might also be possible to run 2 "main" systems with a 3rd "quorum observer" hosting only the management tools but no VMs; however these options are not officially supported, as PVC is designed primarily for 3+ node operation.
|
||||
|
||||
### Feature Questions
|
||||
|
||||
#### Does PVC support containers (Docker/Kubernetes/LXC/etc.)?
|
||||
|
||||
No, not directly. PVC supports only KVM VMs. To run containers, you would need to run a VM which then runs your containers. For instance PVC makes an excellent underlying layer for a virtual Kubernetes cluster, instead of bare hardware.
|
||||
|
||||
#### Does PVC have a WebUI?
|
||||
|
||||
Not yet. Right now, PVC management is done exclusively with the CLI interface to the API. A WebUI can and likely will be built in the future, but I'm not a frontend developer and I do not consider this a personal priority. As of late 2020 the API is generally stable, so I would welcome 3rd party assistance here.
|
||||
|
||||
#### I want feature X, does it fit with PVC?
|
||||
|
||||
That depends on the specific feature. I will limit features to those that align with the overall goals of PVC, that is to say, to provide an easy-to-use hyperconverged virtualization system focused on redundancy. If a feature suits this goal it is likely to be considered; if it does not, it will not. PVC is rapidly approaching the completion of its 1.0 road-map, which I consider feature-complete for the primary use-case, and future versions may expand in scope.
|
||||
|
||||
### Storage Questions
|
||||
|
||||
#### Can I use RAID-5/RAID-6 with PVC?
|
||||
|
||||
The short answer is no. The long answer is: Ceph, the storage backend used by PVC, does support "erasure coded" pools which implement a RAID-5-like (striped with distributed parity) functionality, but PVC does not support this for several reasons, mostly related to ease of management and performance. If you use PVC, you must accept at the very least a 2x storage penalty, and for true multi-node safety and resiliency, a 3x storage penalty for VM storage. This is a trade-off of the architecture and should be taken into account when sizing storage in nodes.
|
||||
|
||||
#### Can I use spinning HDDs with PVC?
|
||||
|
||||
You can, but you won't like the results. SSDs, and specifically datacentre-grade SSDs for resiliency, are required to obtain any sort of reasonable performance when running multiple VMs. The higher-performance the drives, the faster the storage.
|
||||
|
||||
#### What network speed does PVC require?
|
||||
|
||||
For optimal performance, nodes should use at least 10-Gigabit Ethernet network interfaces wherever possible, and on large clusters a dedicated 10-Gigabit "storage" network, separate from the "upstream"/"cluster" networks, is strongly recommended. The storage system performance, especially for writes, is more heavily bottlenecked by the network speed than the actual storage device speed when speaking of high-performance disks. 1-Gigabit Ethernet will be sufficient for some use-cases and is sufficient for the non-storage networks (VM traffic notwithstanding), but storage performance will become severely limited as the cluster grows. Even slower network speeds (e.g. 100-Megabit) are not sufficient for PVC to operate properly except in very limited testing scenarios.
|
||||
|
||||
#### What Ceph version does PVC use?
|
||||
|
||||
PVC requires Ceph 14.x (Nautilus). The official PVC repository at https://repo.bonifacelabs.ca includes Ceph 14.2.x for Debian Buster (updated regularly), since by default it only includes 12.x (Luminous).
|
||||
|
||||
## About The Author
|
||||
|
||||
PVC is written by [Joshua](https://www.boniface.me) [M.](https://bonifacelabs.ca) [Boniface](https://github.com/joshuaboniface). A Linux system administrator by trade, Joshua is always looking for the best solutions to his user's problems, be they developers or end users. PVC grew out of his frustration with the various FOSS virtualization tools, as well as and specifically, the constant failures of Pacemaker/Corosync to gracefully manage a virtualization cluster. He started work on PVC at the end of May 2018 as a simple alternative to a Corosync/Pacemaker-managed virtualization cluster, and has been growing the feature set and stability of the system ever since.
|
||||
|
@ -1,414 +0,0 @@
|
||||
# PVC Cluster Architecture considerations
|
||||
|
||||
- [PVC Cluster Architecture considerations](#pvc-cluster-architecture-considerations)
|
||||
* [Node Specification](#node-specification)
|
||||
+ [n-1 Redundancy](#n-1-redundancy)
|
||||
+ [CPU](#cpu)
|
||||
+ [Memory](#memory)
|
||||
+ [Disk](#disk)
|
||||
+ [Network](#network)
|
||||
* [PVC architecture](#pvc-architecture)
|
||||
+ [Operating System](#operating-system)
|
||||
+ [Ceph Storage Layout](#ceph-storage-layout)
|
||||
+ [Networks](#networks)
|
||||
- [System Networks](#system-networks)
|
||||
- [Client Networks](#client-networks)
|
||||
+ [Fencing and Recovery](#fencing-and-recovery)
|
||||
* [Advanced Layouts](#advanced-layouts)
|
||||
+ [Coordinators versus Hypervisors](#coordinators-versus-hypervisors)
|
||||
+ [Georedundancy](#georedundancy)
|
||||
* [Example System Diagrams](#example-system-diagrams)
|
||||
+ [Small 3-node cluster](#small-3-node-cluster)
|
||||
+ [Large 8-node cluster](#large-8-node-cluster)
|
||||
|
||||
This document contains considerations the administrator should make when preparing for and building a PVC cluster. It is important that prospective PVC administrators read this document *thoroughly* before deploying a cluster to ensure they understand the requirements, caveats, and important details about how PVC operates.
|
||||
|
||||
## Node Specification
|
||||
|
||||
PVC nodes, especially coordinator nodes, run a significant number of software applications in addition to the virtual machines (VMs). It is therefore extremely important to size the systems correctly for the expected workload while planning both for redundancy and future capacity. In general, "taller" nodes (fewer, more powerful machines) are better for performance, providing a more capable cluster on fewer physical hosts, though each workload may be different in this regard.
|
||||
|
||||
The following table provides recommended minimum specifications for each component of the cluster nodes. In general, these minimums are the lowest possible for a production-quality cluster that would provide decent performance for up to about a dozen virtual machines. Of course, further upward scaling is recommended and the specific computational and storage needs of the VM workloads should be taken into account.
|
||||
|
||||
| Resource | Recommended Minimum |
|
||||
| -------- | --------------------|
|
||||
| CPU generation | Intel Sandy Bridge (2011) *or* AMD Naples (2017) |
|
||||
| CPU cores per node | 8 @ 2.0GHz |
|
||||
| RAM per node | 32GB |
|
||||
| System disk (SSD/HDD/USB/SD/eMMC) | 2x 100GB RAID-1 |
|
||||
| Data disk (SSD only) | 1x 400GB |
|
||||
| Network interfaces | 2x 10Gbps (LACP LAG) |
|
||||
| Remote IPMI-over-IP | Available and connected |
|
||||
| Total CPU cores (3 nodes healthy) | 24 |
|
||||
| Total CPU cores (3 nodes n-1) | 16 |
|
||||
| Total RAM (3 nodes healthy) | 96GB |
|
||||
| Total RAM (3 nodes n-1) | 64GB |
|
||||
| Total disk space (3 nodes) | 400GB |
|
||||
|
||||
For testing, or low-budget homelab applications, some aspects can be further tuned down, however consider the following sections carefully.
|
||||
|
||||
### n-1 Redundancy
|
||||
|
||||
Care should be taken to examine the "healthy" versus "n-1" total resource availability. Under normal operation, PVC will use all available resources and distribute VMs across all cluster nodes. However, during single-node failure or maintenance conditions, all VMs will be required to run on the remaining hypervisors. Thus, ensure during planning that there are sufficient resources for the expected workload of the cluster.
|
||||
|
||||
The general values for default resource availability of a 3-node cluster for n-1 availability (1 node offline) are:
|
||||
|
||||
* 1/3 of the total data disk space (3 copies of all data, distributed across all 3 nodes)
|
||||
* 2/3 of the total RAM
|
||||
* 2/3 of the total CPU cores
|
||||
|
||||
For memory provisioning of VMs, PVC will warn the administrator, via a Degraded cluster state, if the "n-1" RAM quantity is exceeded by the total maximum allocation of all running VMs. If nodes are of mismatched sizes, the "n-1" RAM quantity is calculated by removing (one of) the largest node in the cluster and adding the remaining nodes' RAM counts together.
|
||||
|
||||
### CPU
|
||||
|
||||
CPU resources are a very important part of the overall performance of a PVC cluster. Numerous aspects of the system require high-performance CPU cores, including the VM workloads themselves, the PVC databases, and, especially, the Ceph storage subsystem.
|
||||
|
||||
As a general rule, more cores, and faster cores, are always better, and real cores are preferable to SMT virtual cores in most cases.
|
||||
|
||||
#### SMT
|
||||
|
||||
SMT in particular can be a contentious subject, and performance can vary wildly for different workloads; thus, while SMT threads are useful, in terms of performance calculations they should always be considered an afterthought or "bonus" to assist with many VMs contending for resources, and base specifications should be made based on the number of real CPU cores instead.
|
||||
|
||||
#### CPU core counts
|
||||
|
||||
The following should be considered recommended minimums for CPU core allocations:
|
||||
|
||||
* PVC system daemons, including Zookeeper and PostgreSQL databases: 2 CPU cores
|
||||
* Ceph Monitor and Manager processes: 1 CPU core
|
||||
* Ceph OSD processes: 2 CPU cores *per OSD disk*
|
||||
* Virtual Machines: 1 CPU core per vCPU in the largest spec'd VM (e.g. 12 vCPUs in a VM = 12 cores here)
|
||||
|
||||
To provide an example, consider a cluster that would run 2 OSD disks per node and several VMs, the largest of which would require 12 vCPUs:
|
||||
|
||||
* PVC system: 2 cores
|
||||
* Ceph Mon/Mgr: 1 core
|
||||
* Ceph OSDs: 2 * 2 = 4 cores
|
||||
* VMs: 12 cores
|
||||
|
||||
This gives a total of 19 cores, and thus a 20+ core CPU would be recommended.
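
The same arithmetic can be expressed as a simple formula. The sketch below is illustrative only; the function name and default values are assumptions drawn from the recommendations above, not part of PVC itself:

```python
def minimum_cores(osds_per_node, largest_vm_vcpus,
                  pvc_system=2, ceph_mon_mgr=1, cores_per_osd=2):
    """Recommended minimum CPU cores per node, per the guidelines above."""
    return pvc_system + ceph_mon_mgr + cores_per_osd * osds_per_node + largest_vm_vcpus

print(minimum_cores(osds_per_node=2, largest_vm_vcpus=12))  # 19 -> choose a 20+ core CPU
```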
|
||||
|
||||
Additional CPU cores, as previously mentioned, are always better. For instance, though 2 is the recommended minimum per OSD disk, better performance can be achieved if there are 4 cores available per OSD instead. This trade-off depends heavily on the required workload and VM specifications and should be carefully considered.
|
||||
|
||||
#### CPU performance
|
||||
|
||||
While CPU frequency is not a tell-all or even particularly useful metric across generations or manufacturers, within a specific generation and manufacturer, faster CPUs will almost always improve performance across the board, especially when considering the Ceph storage subsystem. If a 2.0GHz and a 2.6GHz CPU of the same core count are both available, the 2.6GHz one is almost always the better choice from a pure performance perspective.
|
||||
|
||||
### Memory
|
||||
|
||||
Memory is extremely important to PVC clusters, and, like CPU resources, a not-insignificant amount of memory is required for the baseline cluster before VMs are considered.
|
||||
|
||||
#### Memory allocations
|
||||
|
||||
The following should be considered recommended minimums for memory allocations:
|
||||
|
||||
* PVC daemons: 1 GB
|
||||
* Zookeeper database: 1 GB
|
||||
* PostgreSQL database: 1 GB
|
||||
* Ceph Monitor and Manager processes: 1 GB
|
||||
* Ceph OSD processes: 1 GB *per OSD disk*
|
||||
|
||||
All additional memory can be consumed by virtual machines.
|
||||
|
||||
To provide an example, in the same cluster as mentioned in the CPU section:
|
||||
|
||||
* PVC system: 1 GB
|
||||
* Zookeeper: 1 GB
|
||||
* PostgreSQL: 1 GB
|
||||
* Ceph Mon/Mgr: 1 GB
|
||||
* Ceph OSDs: 2 * 1 GB = 2 GB
|
||||
|
||||
This gives a total of 6 GB of memory for the base system, with VMs requiring additional memory.
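
As with the CPU sizing above, this baseline can be tallied with a short sketch (the names and default values are illustrative assumptions based on the list above, not PVC configuration values):

```python
def baseline_memory_gb(osds_per_node, pvc_daemons=1, zookeeper=1,
                       postgresql=1, ceph_mon_mgr=1, gb_per_osd=1):
    """Memory consumed by the base PVC system, in GB, before any VMs."""
    return pvc_daemons + zookeeper + postgresql + ceph_mon_mgr + gb_per_osd * osds_per_node

print(baseline_memory_gb(osds_per_node=2))  # 6 GB reserved for the base system
```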
|
||||
|
||||
#### VM Memory Overprovisioning
|
||||
|
||||
An important consideration is that the KVM hypervisor used by PVC will only allocate guest memory *as required by the guest*, but PVC tracks memory allocation based on the allocated maximum. Thus, for example, a VM may be allocated 8192 MB of memory in PVC, and thus the PVC system considers 8 GB "allocated" and "provisioned" to this VM, but if the actual guest is only using 500 MB of that memory, the actual memory usage on the hypervisor node will be 500 MB for that VM. It is thus possible for "all" memory to be allocated on a node while many GB of memory remain "free". This is an intentional design decision to avoid excessive overprovisioning of memory and thus situations where non-VM processes become memory starved, as the PVC system itself does *not* track the usage by the aforementioned processes.
|
||||
|
||||
#### Memory Performance
|
||||
|
||||
Given the recommended CPU requirements, all PVC hypervisors should contain at least DDR3 memory, which is sufficiently performant for all tasks. Memory latency and performance, however, can become important especially in large NUMA systems, and especially with regards to the Ceph storage subsystem. Care should be taken to optimize the memory layout in nodes, for instance making use of all available memory channels in the CPU architecture and preferring 1 DIMM-per-channel (DPC) over 2 DPC.
|
||||
|
||||
#### Ceph OSD memory utilization
|
||||
|
||||
While the recommended *minimum* is 1 GB per OSD process, in reality, Ceph can allocate between 4 and 6 GB of memory per OSD process, especially for caching metadata and other frequently-used data. Thus, for maximum performance, 4 GB instead of 1 GB should be allocated per-OSD.
|
||||
|
||||
#### Memory limit tuning
|
||||
|
||||
The PVC Ansible deployment system allows the administrator to specify limits on some aspects of the aforementioned memory requirements, for instance limiting Zookeeper or Ceph OSD processes to lower amounts of memory. This is not recommended except in situations where memory is extremely constrained; in such situations adding additional memory to nodes is always preferable. For details and examples please see the Ansible variable files.
|
||||
|
||||
### Disk
|
||||
|
||||
#### System Disks
|
||||
|
||||
The performance of system disks is of critical importance in the PVC cluster. At least 32GB of space is required, and at least 100GB is recommended to ensure optimal performance. The system disks should be fast SAS HDDs, SSDs, eMMC flash, class-10 SD cards, or other flash-based media, and RAID-1 is critical for reliability purposes, especially for more wear- or failure-sensitive media types.
|
||||
|
||||
PVC will store the various databases on these disks, so overall performance can affect the responsiveness of the system. However note that no VM data is ever stored on system disks; this is provided exclusively by the Ceph data disks (OSDs).
|
||||
|
||||
#### Ceph OSD disks
|
||||
|
||||
All VM block devices are stored on Ceph OSD data disks. The default pool configuration of the Ceph storage subsystem uses a `copies=3` layout with a `host`-level failure domain; thus, in a 3-node cluster, each block of data is stored 3 times, once per node. This ensures that 2 copies of each piece of data are available even if a host is down, at the cost of reducing the usable storage space to 1/3 of the raw total. Other configurations are possible, but this is the minimum recommended.
|
||||
|
||||
The performance of VM disks will be dictated almost exclusively by the performance of these disks in combination with the CPU resources of the system as discussed previously. Very fast, robust, and resilient storage is highly recommended for OSD disks to maximize performance and longevity. High-performance SATA, SAS, or NVMe SSDs are recommended for this task, sized according to the expected workload. Spinning disks (HDDs) are *not* recommended for this purpose, and their very low random performance will significantly limit the overall storage performance of the cluster.
|
||||
|
||||
Initially, it is optimal if all nodes contain the same number and same size of OSD disks, to ensure even distribution of the data across all disks and thus maximize performance. PVC supports adding additional OSDs at a later time, however the administrator should be cautious to always add new disks in parallel on all nodes at the same time, as otherwise the replication ratio will prevent the new space from being utilized. Thus, in a 3-node cluster, disks must be added 3-at-a-time to all 3 nodes, and these disks must be identically sized, in order to increase the total usable storage space by the value of one of these disks.
|
||||
|
||||
In addition to the primary data disks, PVC also supports the offloading of the Ceph BlueStore OSD database and WAL functions of the OSDs onto a separate OSD database volume group on a dedicated storage device. In the normal use-case, this would be an extremely fast, high-endurance Intel Optane or similarly performant NVMe SSD which is significantly faster than the primary data SSDs. This will help accelerate random write I/Os and metadata lookups, especially when using lower-performance SATA or SAS SSDs. Generally speaking this volume should be large enough to support 5% of the capacity of all OSDs on a node, with some room for future expansion. Only one such device and volume group is supported at this time.
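
Sizing this database/WAL volume group can be estimated from the 5% guideline above. The following sketch is illustrative only, with an assumed 1.5x headroom factor standing in for "some room for future expansion":

```python
def osd_db_volume_gb(node_osd_sizes_gb, fraction=0.05, headroom=1.5):
    """Suggested OSD database/WAL device size for one node: ~5% of its OSD capacity plus headroom."""
    return sum(node_osd_sizes_gb) * fraction * headroom

print(osd_db_volume_gb([400, 400]))  # 60.0 GB for a node with two 400GB OSDs
```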
|
||||
|
||||
### Network
|
||||
|
||||
Because PVC makes extensive use of cross-node communications, high-throughput and low-latency networking is critical. At a minimum, 10-gigabit networking is recommended to ensure suitable throughput for the storage subsystem as well as for VM traffic. Higher-speed networking can also improve performance, especially when using extremely fast Ceph OSD disks.
|
||||
|
||||
A minimum of 2 network interfaces is recommended. These should then be combined into a logical aggregate (LAG) using 802.3ad (LACP) to provide redundant links and a boost in available bandwidth. Additional NICs can also be used to separate discrete parts of the networking stack, which will be discussed below.
|
||||
|
||||
#### Remote IPMI-over-IP
|
||||
|
||||
IPMI provides a method to manage the physical chassis of nodes from outside of their operating system. Common implementations include Dell iDRAC, HP iLO, Cisco CIMC, and others.
|
||||
|
||||
PVC nodes in production deployments should always feature an IPMI-over-IP interface of some kind, which is then reachable either in, or via, the Upstream system network (see [System Networks](#system-networks)). This requirement is discussed in more detail during the [Fencing and Recovery](#fencing-and-recovery) section below.
|
||||
|
||||
## PVC Architecture
|
||||
|
||||
### Operating System
|
||||
|
||||
As an underlying OS, only Debian GNU/Linux 10.x "Buster" or 11.x "Bullseye" are supported by PVC. This is the operating system installed by the PVC [node installer](https://github.com/parallelvirtualcluster/pvc-installer) and expected by the PVC [Ansible configuration system](https://github.com/parallelvirtualcluster/pvc-ansible). Ubuntu or other Debian-derived distributions may work, but are not officially supported. PVC also makes use of a custom repository to provide the PVC software and (for Debian Buster) an updated version of Ceph beyond what is available in the base operating system, and this is only compatible officially with Debian 10 or 11. PVC will generally be upgraded regularly to support new Debian versions. As a rule, using the current versions of the official node installer and Ansible repository is the preferred and only supported method for deploying PVC.
|
||||
|
||||
Currently, only the `amd64` (Intel 64 or AMD64) architecture is officially supported by PVC. Given the cross-platform nature of Python and the various software components in Debian, it may work on `armhf` or `arm64` systems as well, however this has not been tested by the author and is not officially supported at this time.
|
||||
|
||||
### Ceph Storage Layout
|
||||
|
||||
PVC makes use of Ceph, a distributed, replicated, self-healing, and self-managing storage system to provide shared VM storage. While a PVC administrator is not required to understand Ceph for day-to-day administration, and PVC provides interfaces to most of the common storage functions required to operate a cluster, at least some knowledge of Ceph is advisable.
|
||||
|
||||
The Ceph subsystem of PVC creates a "hyperconverged" cluster whereby storage and VM hypervisor functions are collocated onto the same physical servers; PVC does not differentiate between "storage" and "compute" nodes, and while storage support can be disabled and an external Ceph cluster used, this is not recommended. The performance of the storage must be taken into account when sizing the nodes as mentioned above.
|
||||
|
||||
Ceph on PVC is laid out similarly to the other daemons. The Ceph Monitor and Manager functions are delegated to the Coordinators over the storage network, with all nodes connecting to these hosts to obtain the CRUSH maps and select OSD disks. OSDs are then distributed on all hosts, potentially including non-coordinator hypervisors if desired, and communicate with clients and each other over the storage network.
|
||||
|
||||
Disks must be balanced across all storage-containing nodes. For instance, adding 1 disk to 1 node is not sufficient to increase storage space; 1 disk must be added to all storage-containing nodes, based on the configured replication scheme of the various pools (see below), at the same time for the available space to increase. Ideally, disk sizes should also be identical across all storage disks, though the weight of each disk can be configured when added to the cluster. Generally speaking, fewer larger disks are preferable to many smaller disks to minimize storage resource utilization, however slightly more storage performance can be gained from using many small disks, if the other cluster hardware, and specifically CPUs, are performant enough. The administrator should therefore always aim to choose the biggest disks they can and grow by adding more identical disks as space or performance needs grow.
|
||||
|
||||
PVC Ceph pools make use of the replication mechanism of Ceph to store multiple copies of each object, thus ensuring that data is always available even when a host is unavailable. Only "replica"-based Ceph redundancy is supported by PVC; erasure coded pools are not supported due to major performance impacts related to rewrites and random I/O as well as management overhead.
|
||||
|
||||
The default replication level for a new pool is `copies=3, mincopies=2`. This will store 3 copies of each object, with a host-level failure domain, and will allow I/O as long as 2 copies are available. Thus, in a cluster of any size, all data is fully available even if a single host becomes unavailable. It will however use 3x the space for each piece of data stored, which must be considered when sizing the disk space for the cluster: a pool in this configuration, running on 3 nodes each with a single 400GB disk, will effectively have 400GB of total space available for use. As mentioned above, new disks must also be added in groups across nodes equal to the total number of `copies` to ensure new space is usable; for instance in a `copies=3` scheme, at least 3 disks must thus be added to different hosts at the same time for the available space to grow.
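
To make the space calculation concrete, the following sketch approximates the usable capacity of a replicated pool as raw capacity divided by the copy count (real Ceph accounting adds some overhead, so treat this as an upper bound):

```python
def usable_capacity_gb(osd_sizes_gb, copies=3):
    """Approximate usable space of a replica pool: raw capacity divided by the copy count."""
    return sum(osd_sizes_gb) / copies

# 3 nodes, each with a single 400GB OSD, using the default copies=3
print(usable_capacity_gb([400, 400, 400]))  # ~400 GB usable
```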
|
||||
|
||||
Non-default values can also be set at pool creation time. For instance, one could create a `copies=3, mincopies=1` pool, which would allow I/O with two hosts down, but leaves the cluster susceptible to a write hole should a disk fail in this state; this configuration is not recommended in most situations. Alternatively, for additional resilience, one could create a `copies=4, mincopies=2` pool, which would also allow 2 hosts to fail, without a write hole, but would consume 4x the space for each piece of data stored and require new disks to be added in groups of 4 instead. Practically any combination of values is possible, however these 3 are the most relevant for most use-cases, and for most, especially small, clusters, the default is sufficient to provide solid redundancy and guard against host failures until the administrator can respond.
|
||||
|
||||
Replication levels cannot be changed within PVC once a pool is created, however they can be changed via manual Ceph commands on a coordinator should the administrator require this, though discussion of this process is outside of the scope of this documentation. The administrator should carefully consider sizing, failure domains, and performance when first selecting storage devices and creating pools, to ensure the right level of resiliency versus data usage for their use-case and planned cluster size.
|
||||
|
||||
### Networks
|
||||
|
||||
At a minimum, a production PVC cluster should use at least two 10Gbps Ethernet interfaces, connected in an LACP or active-backup bond on one or more switches. On top of this bond, the various cluster networks should be configured as 802.1q vLANs. PVC is able to support configurations without bonding or 802.1q vLAN support, using multiple physical interfaces and no bridged client networks, but this is strongly discouraged due to the added complexity this introduces; the switches chosen for the cluster should include these requirements as a minimum.
|
||||
|
||||
More advanced physical network layouts are also possible. For instance, one could have two isolated networks. On the first network, each node has two 10Gbps Ethernet interfaces, which are combined in a bond across two redundant switch fabrics and handle the upstream and cluster networks. On the second network, each node has an additional two 10Gbps Ethernet interfaces, which are also combined in a bond across the redundant switch fabrics and handle the storage network. This configuration could support up to 10Gbps of aggregate client traffic while also supporting 10Gbps of aggregate storage traffic. Even more complex network configurations are possible if the cluster requires such performance. See the [Example System Diagrams](#example-system-diagrams) section for some basic topology examples.
|
||||
|
||||
Only Ethernet networks are supported by PVC. More exotic interconnects such as Infiniband are not supported by default, and must be manually set up with Ethernet (e.g. EoIB) layers on top to be usable with PVC.
|
||||
|
||||
Lower-speed networks (e.g. 1Gbps or 100Mbps) should not be used as these will severely bottleneck the performance of the storage subsystem. In an advanced split layout, it may be acceptable to use 1Gbps interfaces for VM guest networks, however the core system networks should always be a minimum of 10Gbps.
|
||||
|
||||
PVC manages the IP addressing of all nodes itself and creates the required addresses during node daemon startup; thus, the on-boot network configuration of each interface should be set to "manual" with no IP addresses configured. This recommendation can safely be ignored, however, and the addresses instead specified manually in the networking configurations. PVC nodes use a split (`/etc/network/interfaces.d/<iface>`) network configuration model.
|
||||
|
||||
### System Networks
|
||||
|
||||
#### Upstream: Connecting the nodes to the wider world
|
||||
|
||||
The upstream network functions as the main upstream for the cluster nodes, providing Internet access and a way to route managed client network traffic out of the cluster. In most deployments, this should be an RFC1918 private subnet with an upstream router which can perform NAT translation and firewalling as required, both for the cluster nodes themselves, and also for any RFC1918 managed client networks.
|
||||
|
||||
The floating IP address in the upstream network can be used as a single point of communication with the active primary node, for instance to access the DNS aggregator instance or the management API. PVC provides only limited access control mechanisms to the API interface, so the upstream network should always be protected by a firewall; running PVC directly accessible on the Internet is strongly discouraged and may pose a serious security risk, and all access should be restricted to the smallest possible set of remote systems.
|
||||
|
||||
Nodes in this network are generally assigned static IP addresses which are configured at node install time in the [Ansible deployment configuration](https://github.com/parallelvirtualcluster/pvc-ansible).
|
||||
|
||||
The upstream router should be able to handle static routes to the PVC cluster, or form a BGP neighbour relationship with the coordinator nodes and/or floating IP address to learn routes to the managed client networks.
|
||||
|
||||
The upstream network should generally be large enough to contain:
|
||||
|
||||
0. The upstream router(s)
|
||||
0. The nodes themselves
|
||||
0. In most deployments, the node IPMI management interfaces.
|
||||
|
||||
For example, for a 3+ node cluster, up to about 90 nodes, the following configuration might be used:
|
||||
|
||||
| Description | Address |
|-------------|---------|
| Upstream network | 10.0.0.0/24 |
| Router VIP address | 10.0.0.1 |
| Router 1 address | 10.0.0.2 |
| Router 2 address | 10.0.0.3 |
| PVC floating address | 10.0.0.10 |
| node1 | 10.0.0.11 |
| node2 | 10.0.0.12 |
| etc. | etc. |
| node1-ipmi | 10.0.0.111 |
| node2-ipmi | 10.0.0.112 |
| etc. | etc. |
|
||||
|
||||
For even larger clusters, a `/23` or even larger network may be used.
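
As a quick sanity check on subnet sizing, the Python standard library can confirm that a given prefix has room for the routers, floating address, nodes, and IPMI interfaces; the values below mirror the example table above:

```python
import ipaddress

upstream = ipaddress.ip_network("10.0.0.0/24")
node_count = 90
# 2 routers + router VIP + PVC floating IP, plus one host and one IPMI address per node
required = 2 + 1 + 1 + node_count * 2
usable = upstream.num_addresses - 2  # exclude network and broadcast addresses
print(usable, ">=", required)  # 254 >= 184, so a /24 is sufficient
```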
|
||||
|
||||
#### Cluster: Connecting the nodes with each other
|
||||
|
||||
The cluster network is an unrouted private network used by the PVC nodes to communicate with each other for database access and Libvirt migrations. It is also used as the underlying interface for the BGP EVPN VXLAN interfaces used by managed client networks.
|
||||
|
||||
The floating IP address in the cluster network can be used as a single point of communication with the active primary node.
|
||||
|
||||
Nodes in this network are generally assigned IPs automatically based on their node number (e.g. node1 at `.1`, node2 at `.2`, etc.). The network should be large enough to include all nodes sequentially.
|
||||
|
||||
Generally the cluster network should be completely separate from the upstream network, either a separate physical interface (or set of bonded interfaces) or a dedicated vLAN on an underlying physical device, but they can be collocated if required.
|
||||
|
||||
#### Storage: Connecting Ceph daemons with each other and with OSDs
|
||||
|
||||
The storage network is an unrouted private network used by the PVC node storage OSDs to communicate with each other, for Ceph management functionality, and for QEMU-to-Ceph disk access, without using the main cluster network and introducing potentially large amounts of traffic there.
|
||||
|
||||
The floating IP address in the storage network can be used as a single point of communication with the active primary node, though this will generally be of little use.
|
||||
|
||||
Nodes in this network are generally assigned IPs automatically based on their node number (e.g. node1 at `.1`, node2 at `.2`, etc.). The network should be large enough to include all nodes sequentially.
|
||||
|
||||
The administrator may choose to collocate the storage network on the same physical interface as the cluster network, or on a separate physical interface. This should be decided based on the size of the cluster and the perceived ratios of client network versus storage traffic. In large (>3 node) or storage-intensive clusters, this network should generally be a separate set of fast physical interfaces, separate from both the upstream and cluster networks, in order to maximize and isolate the storage bandwidth. If the administrator does choose to collocate these networks, they may also share the same IP address, thus eliminating any distinction between the Cluster and Storage networks. The PVC software handles this natively when the Cluster and Storage IPs of a node are identical.
|
||||
|
||||
### Client Networks
|
||||
|
||||
#### Bridged (unmanaged) Client Networks
|
||||
|
||||
The first type of client network is the unmanaged bridged network. These networks have a separate vLAN on the device underlying the other networks, which is created when the network is configured. VMs are then bridged into this vLAN.
|
||||
|
||||
With this client network type, PVC does no management of the network. This is left entirely to the administrator. It requires switch support and the configuration of the vLANs on the switchports of each node's physical interfaces before enabling the network.
|
||||
|
||||
Generally, the same physical network interface will underlay both the cluster networks as well as bridged client networks. PVC does however support specifying a separate physical device for bridged client networks, for instance to separate these networks onto a different physical interface from the main cluster networks.
|
||||
|
||||
#### VXLAN (managed) Client Networks
|
||||
|
||||
The second type of client network is the managed VXLAN network. These networks make use of BGP EVPN, managed by route reflection on the coordinators, to create virtual layer 2 Ethernet tunnels between all nodes in the cluster. VXLANs are then run on top of these virtual layer 2 tunnels, with the active primary PVC node providing routing, DHCP, and DNS functionality to the network via a single IP address.
|
||||
|
||||
With this client network type, PVC is in full control of the network. No vLAN configuration is required on the switchports of each node's physical interfaces, as the virtual layer 2 tunnel travels over the cluster layer 3 network. All client network traffic destined for outside the network will exit via the upstream network interface of the active primary coordinator node.
|
||||
|
||||
NOTE: These networks may introduce a bottleneck and tromboning if there is a large amount of external and/or inter-network traffic on the cluster. The administrator should consider this carefully when deciding whether to use managed or bridged networks and properly evaluate the inter-network traffic requirements.
|
||||
|
||||
#### SR-IOV Client Networks
|
||||
|
||||
The third type of client network is the SR-IOV network. SR-IOV (Single-Root I/O Virtualization) is a technique and feature enabled on modern high-performance NICs (for instance, those from Intel or nVidia) which allows a single physical Ethernet port (a "PF" in SR-IOV terminology) to be split, at a hardware level, into multiple virtual Ethernet ports ("VF"s), which can then be managed separately. Starting with version 0.9.21, PVC supports SR-IOV PF and VF configuration at the node level, and these VFs can be passed into VMs in two ways.
|
||||
|
||||
SR-IOV's main benefit is to offload bridging and network functions from the hypervisor layer, and direct them onto the hardware itself. This can increase network throughput in some situations, as well as provide near-complete isolation of guest networks from the hypervisors (in contrast with bridges which *can* expose client traffic to the hypervisors, and VXLANs which *do* expose client traffic to the hypervisors). For instance, a VF can have a vLAN specified, and the tagging/untagging of packets is then carried out at the hardware layer.
|
||||
|
||||
There are however caveats to working with SR-IOV. At the most basic level, the biggest difference with SR-IOV compared to the other two network types is that SR-IOV must be configured on a per-node basis. That is, each node must have SR-IOV explicitly enabled, its specific PF devices defined, and a set of VFs created at PVC startup. Generally, with identical PVC nodes, this will not be a problem but is something to consider, especially if the servers are mismatched in any way. It is thus also possible to set some nodes with SR-IOV functionality, and others without, though care must be taken in this situation to set node limits in the VM metadata of any VMs which use SR-IOV VFs to prevent failed migrations.
|
||||
|
||||
PFs are defined in the `pvcnoded.yml` configuration of each node, via the `sriov_device` list. Each PF can have an arbitrary number of VFs (`vfcount`) allocated, though each NIC vendor and model has specific limits. Once configured, specifically with Intel NICs, PFs (and specifically, the `vfcount` attribute in the driver) are immutable and cannot be changed easily without completely flushing the node and rebooting it, so care should be taken to select the desired settings as early in the cluster configuration as possible.
|
||||
|
||||
Once created, VFs are also managed on a per-node basis. That is, each VF, on each host, even if they have the exact same device names, is managed separately. For instance, the VF `ens1f0v0` created from the PF `ens1f0` on "`hv1`" can have a different configuration from the identically-named VF `ens1f0v0` on "`hv2`". The administrator is responsible for ensuring consistency here, and for ensuring that devices do not overlap (e.g. assigning the same VF name to VMs on two separate nodes which might migrate to each other). PVC will however explicitly prevent two VMs from being assigned to the same VF on the same node, even if this may be technically possible in some cases.
|
||||
|
||||
When attaching VFs to VMs, there are two supported modes: `macvtap`, and `hostdev`.
|
||||
|
||||
`macvtap`, as the name suggests, uses the Linux `macvtap` driver to connect the VF to the VM. Once attached, the vNIC behaves just like a "bridged" network connection above, and like "bridged" connections, the "mode" of the NIC can be specified, defaulting to "virtio" but supporting various emulated devices instead. Note that in this mode, vLANs cannot be configured on the guest side; they must be specified in the VF configuration (`pvc network sriov vf set`) with one vLAN per VF. VMs with `macvtap` interfaces can be live migrated between nodes without issue, assuming there is a corresponding free VF on the destination node, and the SR-IOV functionality is transparent to the VM.
|
||||
|
||||
`hostdev` is a direct PCIe pass-through method. With a VF attached to a VM in `hostdev` mode, the virtual PCIe NIC device itself becomes hidden from the node, and is visible only to the guest, where it appears as a discrete PCIe device. In this mode, vLANs and other attributes can be set on the guest side at will, though setting vLANs and other properties in the VF configuration is still supported. The main caveat to this mode is that VMs with connected `hostdev` SR-IOV VFs *cannot be live migrated between nodes*. Only a `shutdown` migration is supported, and, like `macvtap`, an identical PCIe device at the same bus address must be present on the target node. To prevent unexpected failures, PVC will explicitly set the VM metadata for the "migration method" to "shutdown" the first time that a `hostdev` VF is attached to it; if this changes later, the administrator must change this back explicitly.
|
||||
|
||||
Generally speaking, SR-IOV connections are not recommended unless there is a good use-case for them. On modern hardware, software bridges are extremely performant, and are much simpler to manage. The functionality is provided for those rare use-cases where SR-IOV is absolutely required by the administrator, but care must be taken to understand all the requirements and caveats of SR-IOV before using it in production.
|
||||
|
||||
#### Other Client Networks
|
||||
|
||||
Future PVC versions may support other client network types, such as direct-routing between VMs.
|
||||
|
||||
### Fencing and Recovery
|
||||
|
||||
Self-management and self-healing are important components of PVC's design, and to accomplish this, PVC contains automated fencing and recovery functions to handle situations where nodes crash or become unreachable. PVC is then able, if properly configured, to directly power-cycle the failed node, and bring up any VMs that were running on it on the remaining hypervisors. This ensures that, while there might be a few minutes of downtime for VMs, they are recovered as quickly as possible without human intervention.
|
||||
|
||||
To operate correctly, these functions require each node in the cluster to have a functional IPMI-over-IP setup with a configured user who is able to perform chassis power commands. This differs depending on the chassis manufacturer and model, and should be tested prior to deploying any production cluster. If IPMI is not configured correctly at node startup, the daemon will warn and disable automatic recovery of the node. The IPMI should be present in the Upstream system network (see [System Networks](#system-networks) above), or in another secured network which is reachable from the Upstream system network, whichever is more convenient for the layout of the networks.
|
||||
|
||||
The general process is divided into 3 sections: detecting node failures, fencing nodes, and recovering from fenced nodes. Note that this process only applies to nodes in the `run` "daemon state"; if a node daemon cleanly shuts down (for instance due to a service restart or administrative action), it will not be fenced.
|
||||
|
||||
#### Detecting Failed Nodes
|
||||
|
||||
Within the PVC configuration, each node has 3 settings which determine the failure detection time. The first is the `keepalive_interval` setting. This is normally set to 5 seconds, and is the interval at which the node daemon of each node sends its keepalives (as well as gathers statistics about running VMs, Ceph components, etc.). This interval should never need to be changed, but is configurable for maximum flexibility in corner cases. During each keepalive, the node updates a specific key in the Zookeeper cluster with the current UNIX timestamp, which determines when the node was last alive. During their own keepalives, the other nodes check their peers' timestamps to confirm if they are updating normally. Note that, due to this happening during the peer keepalives, if all nodes lose contact with the Zookeeper database, they will *not* immediately begin fencing each other, since the keepalives will not complete; they will, however, upon recovery, jump immediately to the next section when they all realize that their last keepalives were over the threshold, and this situation is discussed there.
|
||||
|
||||
The second option is the `fence_intervals` setting. This option determines how many keepalive intervals a node can miss before it is marked `dead` and a fencing sequence started. This is normally set to 6 intervals, which combined with the 5 second `keepalive_interval`, gives a total of 30 seconds (+/- up to another 5 second `keepalive_interval` for peers should they not line up) for the node to be without updates before fencing begins.
|
||||
|
||||
The third setting is optional, and is best used in situations where the IPMI connectivity of a node is excessively flaky or can be impaired (e.g. georedundant clusters), or where VM uptime is more important than the burden of recovering from a split-brain situation, and is not as extensively tested. This option is `suicide_intervals`, and if set to a non-0 value, is the number of keepalive intervals before a node *itself* determines that it should forcibly power itself off, which should always be equal to or less than the normal `fence_intervals` setting. Naturally, the node must be somewhat functional to do this, and this can go very wrong, so using this option is not normally recommended.
|
||||
|
||||
#### Fencing Nodes
|
||||
|
||||
Once the cluster, and specifically one node in the cluster, has determined that a given node is `dead` due to a lack of keepalives, the fencing process starts. This spawns a dedicated child thread within the node daemon of the detecting node, which continually monitors the state of the `dead` node and then performs the fence.
|
||||
|
||||
During the `dead` process, the failed node has 6 chances, called "saving throws", at `keepalive_interval` second windows, to send another keepalive before it is fenced. This additional, fixed, delay helps ensure that the cluster will gracefully recover from intermittent network failures or loss of Zookeeper contact, by providing nodes up to another 6 keepalive intervals to save themselves once the fence timer actually begins. This brings the total time, with default options, from a node stopping contact to that node being fenced, to between 60 and 65 seconds. This duration is considered by the author an acceptable compromise between speedy recovery and avoiding false positives (and hence larger outages).
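
The timing described above follows directly from the configuration values; this sketch shows where the 60-65 second window comes from, assuming the default settings:

```python
keepalive_interval = 5  # seconds between keepalives
fence_intervals = 6     # missed keepalives before a node is marked dead
saving_throws = 6       # additional keepalive windows before the fence actually fires

detection = keepalive_interval * fence_intervals  # 30 seconds to mark the node dead
grace = keepalive_interval * saving_throws        # 30 seconds of "saving throws"
print(f"{detection + grace} to {detection + grace + keepalive_interval} seconds")  # 60 to 65 seconds
```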
|
||||
|
||||
Once a node has been marked `dead` and has failed its 6 "saving throws", the fence process triggers an IPMI chassis reset sequence. First, the node is issued an IPMI `chassis power off` command to trigger a cold system shutdown. Next, it waits a fixed 1 second and then checks and logs the current `chassis power state`, and then issues a `chassis power on` signal to start up the node. It then finally waits a fixed 2 seconds, and then checks the current `chassis power status`. Using the results of these 3 commands, PVC is then able to determine with near certainty whether the node has truly been forced offline or not, and it can proceed to the next step.
|
||||
|
||||
#### Recovery from Node Fences
|
||||
|
||||
Once a node has been fenced, successfully or not, the system waits for one keepalive interval before proceeding.
|
||||
|
||||
The cluster then determines what to do based both on the result of the fence (whether the node was determined to have been successfully cold-reset or not) and on two additional configuration values. The first, `successful_fence`, specifies what action to take when the fence was successful, and is either `migrate` (VMs to other nodes), the default, or `None` (no action). The second, `failed_fence`, is an identical choice for when the fence was unsuccessful, and defaults to `None`.
|
||||
|
||||
If the fence was successful and `successful_fence` is set to `None`, then no migration takes place and the VMs on the fenced node will remain offline until the node recovers. If instead `successful_fence` is set to the default of `migrate`, the system will then begin migrating (and hence, starting) VMs that were active on the failed node to other nodes in the cluster. During this special `fence-flush` action, any stale RBD locks on the storage volumes are forcibly cleared, and this is considered safe since the fenced node is determined to have successfully been powered off and the VMs thus terminated. Once all VMs are migrated, the fenced node will then be set to a normal `flushed` state, as if it had been cleanly flushed before powering off. If and when the node returns to active, healthy service, either automatically (if the reset cleared the fault condition) or after human intervention, VMs can then migrate back and the cluster can resume normal operation; otherwise the cluster will remain in the degraded state until corrected.
|
||||
|
||||
If the fence was unsuccessful and `failed_fence` is set to the default of `None`, no automatic recovery takes place, since the cluster cannot determine that it is safe to do so. This would most commonly occur during network partitions where the `dead` node potentially remains up with VMs running on it, and the cluster is now in a split-brain situation. The `suicide_interval` option mentioned above is provided for this specific situation, and would allow the administrator to set the `failed_fence` action to `migrate` as well, as they could be somewhat confident that the node will have forcibly terminated itself. However due to the inherent potential for danger in this scenario, it is recommended to leave these options at their defaults, and handle such situations manually instead, as well as ensuring proper network design to avoid the potential for such split-brain situations to occur.
|
||||
|
||||
## Advanced Layouts
|
||||
|
||||
### Coordinators versus Hypervisors
|
||||
|
||||
While a normal basic PVC cluster would consist of 3, or perhaps 5, nodes, PVC is able to scale up much further by differentiating between "coordinator" and "hypervisor" nodes. Such a basic cluster would consist only of coordinator nodes. When scaling up, however, it is prudent to add new nodes as hypervisor nodes instead to minimize database scaling problems.
|
||||
|
||||
#### Coordinators
|
||||
|
||||
Coordinators are a special set of 3 or 5 nodes with additional functionality. The coordinator nodes run, in addition to the PVC software itself, a number of databases and additional functions which are required by the whole cluster. An odd number of coordinators is *always* required to maintain quorum, though there are diminishing returns when creating more than 3. As mentioned above, generally for small clusters all nodes are coordinators.
|
||||
|
||||
These additional functions are:
|
||||
|
||||
0. The Zookeeper database cluster containing the cluster state and configuration
|
||||
0. The Patroni PostgreSQL database cluster containing DNS records for managed networks and provisioning configurations
|
||||
0. The FRR EBGP route reflectors and upstream BGP peers
|
||||
|
||||
In addition to these functions, coordinators can usually also run all other PVC node functions.
|
||||
|
||||
The set of coordinator nodes is generally configured at cluster bootstrap, initially with 3 nodes, which are then bootstrapped together to form a basic 3-node cluster. Additional nodes, either as coordinators or as hypervisors, can then be added to the running cluster to bring it up to its final size, either immediately or as the needs of the cluster change.
|
||||
|
||||
##### The Primary Coordinator
|
||||
|
||||
Within the set of coordinators, a single primary coordinator is elected at cluster startup and as nodes start and stop, or in response to administrative commands. Once a node becomes primary, it will remain so until it stops or is told not to be. This coordinator is responsible for some additional functionality in addition to the other coordinators. These additional functions are:
|
||||
|
||||
0. The floating IPs in the main networks
|
||||
0. The default gateway IP for each managed client network
|
||||
0. The DNSMasq instance handling DHCP and DNS for each managed client network
|
||||
0. The API and provisioner clients and workers
|
||||
|
||||
PVC gracefully handles transitioning primary coordinator state, to minimize downtime. Workers will continue to operate on the old coordinator if available after a switchover and the administrator should be aware of any active tasks before switching the active primary coordinator.
|
||||
|
||||
#### Hypervisors
|
||||
|
||||
Hypervisor nodes do not run any of the database or routing functionality of coordinator nodes, nor can they become the primary coordinator node (for obvious reasons). When scaling a cluster up beyond the initial 3, or perhaps 5, coordinator nodes, or when an even number of nodes (e.g. 4) may be desired, any nodes beyond the 3 coordinators should be added as hypervisors.
|
||||
|
||||
Hypervisor nodes are capable of running VMs and Ceph OSD disks, just like coordinator nodes, though the latter is optional.
|
||||
|
||||
PVC has no limit to the number of hypervisor nodes that can connect to a set of coordinators, though beyond a dozen or so total nodes, a more scale-focused infrastructure solution may be warranted.
|
||||
|
||||
### Georedundancy
|
||||
|
||||
PVC supports geographic redundancy of nodes in order to facilitate disaster recovery scenarios when uptime is critical. Functionally, PVC behaves the same regardless of whether the 3 or more coordinators are in the same physical location, or remote physical locations.
|
||||
|
||||
When using geographic redundancy, there are several caveats to keep in mind:
|
||||
|
||||
* The Ceph storage subsystem is latency-sensitive. With the default replication configuration, at least 2 writes must succeed for the write to return a success, so the total write latency of a write on any system will be equal to the maximum latency between any two nodes. It is recommended to keep all PVC nodes as "close" as possible latency-wise or storage performance may suffer.
|
||||
|
||||
* The inter-node PVC networks (see [System Networks](#system-networks)) must be layer-2 networks (broadcast domains). These networks must be spanned to all nodes in all locations.
|
||||
|
||||
* The number of sites and positioning of coordinators at those sites is important. A majority (at least 2 in a 3-coordinator cluster, or 3 in a 5-coordinator cluster) of coordinators must be able to reach each other in a failure scenario for the cluster as a whole to remain functional. Thus, configurations such as 2 + 1 or 3 + 2 splits across 2 sites do *not* provide full redundancy, and the whole cluster will be down if the majority site is down. It is thus recommended to always have an odd number of sites to match the odd number of coordinators, for instance a 1 + 1 + 1 or 2 + 2 + 1 configuration. Also note that all hypervisors must be able to reach the majority coordinator group or their storage will be impacted as well.
|
||||
|
||||
This diagram outlines the supported and unsupported/unreliable georedundant configurations for 3 nodes. Care must always be taken to ensure that the cluster can operate with the loss of any given georedundant site.
|
||||
|
||||

|
||||
|
||||
*Above: Supported and unsupported/unreliable georedundant configurations*
|
||||
|
||||
* Even if the PVC software itself is in an unmanageable state, VMs will continue to run if at all possible. However, since the storage subsystem makes use of the same quorum, losing more than half of the coordinator nodes will very likely result in storage interruption as well, which will affect running VMs.
|
||||
|
||||
* Nodes in remote geographic locations might not be able to be fenced by the remaining PVC nodes if the entire site is unreachable. The cluster will thus be unable to automatically recover VMs at the failed site should it go down. If at all possible, redundant links to georedundant sites are recommended to ensure there is always a network path. Note that the `suicide_interval` configuration option, while it might seem to help here, will not, because the remaining nodes will not be able to reliably confirm whether the remote site actually *did* shut itself off. Thus automatic failover of georedundant sites is a potential deficiency that must be considered.
|
||||
|
||||
If these requirements cannot be fulfilled, it may be best to have separate PVC clusters at each site and handle service redundancy at a higher layer to avoid a major disruption.
|
||||
|
||||
## Example System Diagrams
|
||||
|
||||
This section provides diagrams of 2 best-practice cluster configurations. These diagrams can be extrapolated out to almost any possible configuration and number of nodes.
|
||||
|
||||
#### Small 3-node cluster
|
||||
|
||||
[](/images/pvc-3-node-cluster.png)
|
||||
|
||||
*Above: A diagram of a simple 3-node cluster with all nodes as coordinators. Dual 10 Gbps network interface per node, unified physical networking with collapsed cluster and storage networks.*
|
||||
|
||||
#### Large 8-node cluster
|
||||
|
||||
[](/images/pvc-8-node-cluster.png)
|
||||
|
||||
*Above: A diagram of a large 8-node cluster with 3 coordinators and 5 hypervisors. Quad 10Gbps network interfaces per node, split physical networking into guest/cluster and storage networks.*
|
# Getting started - deploying a Parallel Virtual Cluster
|
||||
|
||||
PVC aims to be easy to deploy, letting you get on with managing your cluster in just a few hours at most. Once initial setup is complete, the cluster is managed via the clients, though the Ansible framework is used to add, remove, or modify nodes as required.
|
||||
|
||||
This guide will walk you through setting up a simple 3-node PVC cluster from scratch, ending with a fully-usable cluster ready to provision virtual machines. Note that all domains, IP addresses, etc. used are examples - when following this guide, be sure to modify the commands and configurations to suit your needs.
|
||||
|
||||
### Part One - Preparing for bootstrap
|
||||
|
||||
0. Read through the [Cluster Architecture documentation](/cluster-architecture). This documentation details the requirements and conventions of a PVC cluster, and is important to understand before proceeding.
|
||||
|
||||
0. Download the latest copy of the [`pvc-ansible`](https://github.com/parallelvirtualcluster/pvc-ansible) repository to your local machine.
|
||||
|
||||
0. Leverage the `create-local-repo.sh` script in the `pvc-ansible` directory to set up a local cluster configuration directory; follow the instructions the script provides, as all future steps will be done inside your new local configuration directory.
|
||||
|
||||
0. Create an initial `hosts` inventory, using `hosts.default` in the `pvc-ansible` repo as a template. You can manage multiple PVC clusters ("sites") from the Ansible repository easily, however for simplicity you can use the simple name `cluster` for your initial site. Define the 3 hostnames you will use under the site group; usually the provided names of `pvchv1`, `pvchv2`, and `pvchv3` are sufficient, though you may use any hostname pattern you wish. It is *very important* that the names all contain a sequential number, however, as this is used by various components.
|
||||
|
||||
0. Create an initial set of `group_vars` for your cluster at `group_vars/<cluster>`, using the `group_vars/default` in the `pvc-ansible` repo as a template. Inside these group vars are two main files: `base.yml` and `pvc.yml`. These example files are well-documented; read them carefully and specify all required options before proceeding, and reference the [Ansible setup examples](https://github.com/parallelvirtualcluster/pvc-ansible) for more detailed descriptions of the options.
|
||||
|
||||
* `base.yml` configures the `base` role and some common per-cluster configurations such as an upstream domain, a root password, a set of administrative users, various hardware configuration items, as well as, most importantly, the basic network configuration of the nodes. Make special note of the various items that must be generated such as passwords; these should all be cluster-unique.
|
||||
|
||||
* `pvc.yml` configures the `pvc` role, including all the dependent software and PVC itself. Important to note is the `pvc_nodes` list, which contains a list of all the nodes as well as per-node configurations for each. All nodes must be a part of this list.
|
||||
|
||||
0. In the `pvc-installer` directory, run the `buildiso.sh` script to generate an installer ISO. This script requires `debootstrap`, `isolinux`, and `xorriso` to function. The resulting file will, by default, be named `pvc-installer_<date>.iso` in the current directory. For additional options, use the `-h` flag to show help information for the script.
|
||||
|
||||
### Part Two - Preparing and installing the physical hosts
|
||||
|
||||
0. Prepare 3 physical servers with IPMI. The servers should match the specifications and requirements outlined in the [Cluster Architecture documentation](/cluster-architecture). Connect their networking based on the configuration set in the `base.yml` group vars file for your cluster.
|
||||
|
||||
0. Load the installer ISO generated in step 6 of the previous section onto the physical servers, either via a USB stick or via IPMI virtual media.
|
||||
|
||||
0. Boot the physical servers off of the installer ISO. Use UEFI mode - if available - for maximum flexibility and longevity.
|
||||
|
||||
0. Follow the prompts from the installer ISO. It will ask for a hostname, the system disk device to use, the initial network interface to configure as well as vLANs and either DHCP or static IP information, and finally either an HTTP URL containing an SSH `authorized_keys` to use for the `deploy` user, or a password for this user if key auth is unavailable.
|
||||
|
||||
0. Wait for the installer to complete. This may take several minutes.
|
||||
|
||||
0. At the end of the install process, follow the prompts carefully; it is usually prudent to pre-seed the `/etc/network/interfaces` configuration based on your expected final physical network config (e.g. set up bonding, etc.) before proceeding, especially if you use DHCP, as the bonding configuration applied later could affect the address. The `chroot` is likely unneeded unless you have good reason to edit the system in this way.
|
||||
|
||||
0. Make note of the (temporary and insecure!) root password set by the installer; you may need it to troubleshoot the system if it does not come up properly. This will be overwritten later in the setup process.
|
||||
|
||||
0. Press "Enter" to reboot the system and confirm it is reachable.
|
||||
|
||||
0. Repeat the above steps for all 3 initial nodes. On boot, they will display their configured IP address to be used in the next steps.
|
||||
|
||||
### Part Three - Initial bootstrap with Ansible
|
||||
|
||||
0. Make note of the IP addresses of all 3 initial nodes, and configure DNS, `/etc/hosts`, or Ansible `ansible_host=` hostvars to map these IP addresses to the hostnames set in the Ansible `hosts` and `group_vars` files.
|
||||
|
||||
0. Verify connectivity from your administrative host to the 3 initial nodes, including SSH access as the `deploy` user. Accept their host keys as required before proceeding as Ansible does not like those prompts. If you did not configure SSH key auth during the PVC installer process, configure it now, as it greatly simplifies Ansible configuration.
|
||||
|
||||
0. Verify your `group_vars` setup from part 1, as errors here may require a re-installation and restart of the bootstrap process.
|
||||
|
||||
0. Perform the initial bootstrap. From your local configuration repository directory, execute the following `ansible-playbook` command, replacing `<cluster_name>` with the Ansible group name from the `hosts` file. Make special note of the additional `bootstrap=yes` variable, which tells the playbook that this is an initial bootstrap run.
|
||||
`$ ansible-playbook -v -i hosts pvc.yml -l <cluster_name> -e bootstrap=yes`
|
||||
|
||||
**WARNING:** Never run this playbook with the `-e bootstrap=yes` option against an active, already-bootstrapped cluster. This will have **disastrous consequences** including the **loss of all data** in the Ceph system as well as any configured networks, VMs, etc.
|
||||
|
||||
0. Wait for the Ansible playbook run to finish. Once completed, the cluster bootstrap will be finished, and all 3 nodes will have rebooted into a working PVC cluster. If any errors occur, carefully evaluate them and re-run the playbook (with `-e bootstrap=yes` - your cluster is not active yet!) as required.
|
||||
|
||||
0. Download and install the CLI client package (`pvc-client-cli.deb`) on your administrative host, and add and verify connectivity to the cluster; this will also verify that the API is working. You will need to know the cluster upstream floating IP address you configured in the `networks` section of the `base.yml` playbook, and if you configured SSL or authentication for the API in your `group_vars`, adjust the first command as needed (see `pvc cluster add -h` for details). A human-readable description can also be specified, which is useful if you manage multiple clusters and their names become unwieldy.
|
||||
`$ pvc cluster add -a <upstream_floating_ip> -d "My first PVC cluster" mycluster`
|
||||
`$ pvc -c mycluster node list`
|
||||
|
||||
You can also set a default cluster by exporting the `PVC_CLUSTER` environment variable to avoid requiring `-c cluster` with every subsequent command:
|
||||
`$ export PVC_CLUSTER="mycluster"`
|
||||
|
||||
**Note:** It is fully possible to administer the cluster from the nodes themselves via SSH should you so choose, to avoid requiring the PVC client on your local machine.
|
||||
|
||||
### Part Four - Configuring the Ceph storage cluster
|
||||
|
||||
0. Determine the Ceph OSD block devices on each host via an `ssh` shell. For instance, use `lsblk` or check `/dev/disk/by-path` to show the block devices by their physical SAS/SATA bus location, and obtain the relevant `/dev/sdX` name for each disk you wish to be a Ceph OSD on each host.
|
||||
|
||||
0. Configure an OSD device for each data disk in each host. The general command is:
|
||||
`$ pvc storage osd add --weight <weight> <node> <device>`
|
||||
|
||||
For example, if each node has two data disks, as `/dev/sdb` and `/dev/sdc`, run the commands as follows to add the first disk to each node, then the second disk to each node:
|
||||
`$ pvc storage osd add --weight 1.0 pvchv1 /dev/sdb`
|
||||
`$ pvc storage osd add --weight 1.0 pvchv2 /dev/sdb`
|
||||
`$ pvc storage osd add --weight 1.0 pvchv3 /dev/sdb`
|
||||
`$ pvc storage osd add --weight 1.0 pvchv1 /dev/sdc`
|
||||
`$ pvc storage osd add --weight 1.0 pvchv2 /dev/sdc`
|
||||
`$ pvc storage osd add --weight 1.0 pvchv3 /dev/sdc`
|
||||
|
||||
**NOTE:** On the CLI, the `--weight` argument is optional and defaults to `1.0`; in the API, it must be specified explicitly. OSD weights determine the relative amount of data which can fit onto each OSD. Under normal circumstances, you would want all OSDs to be of identical size, and hence all should have the same weight. If your OSDs are instead different sizes, the weight should be proportional to the size, e.g. `1.0` for a 100GB disk, `2.0` for a 200GB disk, etc. For more details, see the [Cluster Architecture](/cluster-architecture) and Ceph documentation.
|
||||
|
||||
**NOTE:** OSD commands wait for the action to complete on the node, and can take some time (up to 30 seconds).
|
||||
|
||||
**NOTE:** You can add OSDs in any order you wish; for instance, you can add the first OSD to each node and then the second to each node (as in the example above), or add all of one node's OSDs before moving on to the next node. The ordering does not affect the cluster in any way.
|
||||
|
||||
0. Verify that the OSDs were added and are functional (`up` and `in`):
|
||||
`$ pvc storage osd list`
|
||||
|
||||
0. Create an RBD pool to store VM images on. The general command is:
|
||||
`$ pvc storage pool add <name> <placement_groups>`
|
||||
|
||||
**NOTE:** Ceph placement groups are a complex topic; as a general rule it's easier to grow than shrink, so start small and grow as your cluster grows. The following are some good starting numbers for 3-node clusters, though the Ceph documentation and the [Ceph placement group calculator](https://ceph.com/pgcalc/) are advisable for anything more complex. There is a trade-off between CPU usage and the number of total PGs for all pools in the cluster, with more PGs meaning more CPU usage.
|
||||
|
||||
* 3 OSDs total: 128 PGs (1 pool) or 64 PGs (2 or more pools, each)
|
||||
* 6 OSDs total: 256 PGs (1 pool) or 128 PGs (2 or more pools, each)
|
||||
* 9+ OSDs total: 256 PGs
|
||||
|
||||
For example, to create a pool named `vms` with 256 placement groups, run the command as follows:
|
||||
`$ pvc storage pool add vms 256`
|
||||
|
||||
**NOTE:** As detailed in the [cluster architecture documentation](/cluster-architecture), you can also set a custom replica configuration for each pool if the default of 3 replica copies with 2 minimum copies is not acceptable. See `pvc storage pool add -h` or that document for full details.
|
||||
|
||||
0. Verify that the pool was added:
|
||||
`$ pvc storage pool list`
|
||||
|
||||
### Part Five - Creating virtual networks
|
||||
|
||||
0. Determine a domain name and IPv4, and/or IPv6 network for your first client network, and any other client networks you may wish to create. These networks must not overlap with the cluster networks. For full details on the client network types, see the [cluster architecture documentation](/cluster-architecture).
|
||||
|
||||
0. Create the virtual network. There are many options here, so see `pvc network add -h` for details.
|
||||
|
||||
For example, to create the managed (EVPN VXLAN) network `100` with subnet `10.100.0.0/24`, gateway `.1` and DHCP from `.100` to `.199`, run the command as follows:
|
||||
`$ pvc network add 100 --type managed --description my-managed-network --domain myhosts.local --ipnet 10.100.0.0/24 --gateway 10.100.0.1 --dhcp --dhcp-start 10.100.0.100 --dhcp-end 10.100.0.199`
|
||||
|
||||
For another example, to create the static bridged (switch-configured, tagged VLAN, with no PVC management of IPs) network `200`, run the command as follows:
|
||||
`$ pvc network add 200 --type bridged --description my-bridged-network`
|
||||
|
||||
**NOTE:** Network descriptions cannot contain spaces or special characters; keep them short, sweet, and dash or underscore delimited.
|
||||
|
||||
0. Verify that the network(s) were added:
|
||||
`$ pvc network list`
|
||||
|
||||
0. On the upstream router, configure one of:
|
||||
|
||||
a) A BGP neighbour relationship with the cluster upstream floating address to automatically learn routes.
|
||||
|
||||
b) Static routes for the configured client IP networks towards the cluster upstream floating address.
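For example, a static route on a Linux-based upstream router, assuming a hypothetical cluster upstream floating IP of `10.0.0.250` and the example managed network `10.100.0.0/24` created above, might look like:

`$ ip route add 10.100.0.0/24 via 10.0.0.250`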
|
||||
|
||||
0. On the upstream router, if required, configure NAT for the configured client IP networks.
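For example, with `iptables`, and assuming the example managed network above and a hypothetical Internet-facing interface named `eth0` on the router, source NAT could be configured with:

`$ iptables -t nat -A POSTROUTING -s 10.100.0.0/24 -o eth0 -j MASQUERADE`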
|
||||
|
||||
0. Verify the client networks are reachable by pinging the managed gateway from outside the cluster.
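For example, using the managed network created above:

`$ ping -c 3 10.100.0.1`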
|
||||
|
||||
|
||||
### You're Done!
|
||||
|
||||
0. Set all 3 nodes to `ready` state, allowing them to run virtual machines. The general command is:
|
||||
`$ pvc node ready <node>`
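For example, for the three example nodes:

`$ pvc node ready pvchv1`

`$ pvc node ready pvchv2`

`$ pvc node ready pvchv3`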
|
||||
|
||||
Congratulations, you now have a basic PVC storage cluster, ready to run your VMs.
|
||||
|
||||
For next steps, see the [Provisioner manual](/manuals/provisioner) for details on how to use the PVC provisioner to create new Virtual Machines, as well as the [CLI manual](/manuals/cli) and [API manual](/manuals/api) for details on day-to-day usage of PVC.
|
@ -1,58 +0,0 @@
|
||||
<p align="center">
|
||||
<img alt="Logo banner" src="images/pvc_logo_black.png"/>
|
||||
<br/><br/>
|
||||
<a href="https://github.com/parallelvirtualcluster/pvc"><img alt="License" src="https://img.shields.io/github/license/parallelvirtualcluster/pvc"/></a>
|
||||
<a href="https://github.com/parallelvirtualcluster/pvc/releases"><img alt="Release" src="https://img.shields.io/github/release-pre/parallelvirtualcluster/pvc"/></a>
|
||||
<a href="https://parallelvirtualcluster.readthedocs.io/en/latest/?badge=latest"><img alt="Documentation Status" src="https://readthedocs.org/projects/parallelvirtualcluster/badge/?version=latest"/></a>
|
||||
</p>
|
||||
|
||||
## What is PVC?
|
||||
|
||||
PVC is a Linux KVM-based hyperconverged infrastructure (HCI) virtualization cluster solution that is fully Free Software, scalable, redundant, self-healing, self-managing, and designed for administrator simplicity. It is an alternative to other HCI solutions such as Harvester, Nutanix, and VMWare, as well as to other common virtualization stacks such as ProxMox and OpenStack.
|
||||
|
||||
PVC is a complete HCI solution, built from well-known and well-trusted Free Software tools, to assist an administrator in creating and managing a cluster of servers to run virtual machines, as well as self-managing several important aspects including storage failover, node failure and recovery, virtual machine failure and recovery, and network plumbing. It is designed to act consistently, reliably, and unobtrusively, letting the administrator concentrate on more important things.
|
||||
|
||||
PVC is highly scalable. From a minimum (production) node count of 3, up to 12 or more, and supporting many dozens of VMs, PVC scales along with your workload and requirements. Deploy a cluster once and grow it as your needs expand.
|
||||
|
||||
As a consequence of its features, PVC makes administrating very high-uptime VMs extremely easy, featuring VM live migration, built-in always-enabled shared storage with transparent multi-node replication, and consistent network plumbing throughout the cluster. Nodes can also be seamlessly removed from or added to service, with zero VM downtime, to facilitate maintenance, upgrades, or other work.
|
||||
|
||||
PVC also features an optional, fully customizable VM provisioning framework, designed to automate and simplify VM deployments using custom provisioning profiles, scripts, and CloudInit userdata API support.
|
||||
|
||||
Installation of PVC is accomplished by two main components: a [Node installer ISO](https://github.com/parallelvirtualcluster/pvc-installer) which creates on-demand installer ISOs, and an [Ansible role framework](https://github.com/parallelvirtualcluster/pvc-ansible) to configure, bootstrap, and administrate the nodes. Installation can also be fully automated with a companion [cluster bootstrapping system](https://github.com/parallelvirtualcluster/pvc-bootstrap). Once up, the cluster is managed via an HTTP REST API, accessible via a Python Click CLI client or WebUI.
|
||||
|
||||
Just give it physical servers, and it will run your VMs without you having to think about it, all in just an hour or two of setup time.
|
||||
|
||||
|
||||
## What is it based on?
|
||||
|
||||
The core node and API daemons, as well as the CLI API client, are written in Python 3 and are fully Free Software (GNU GPL v3). In addition to these, PVC makes use of the following software tools to provide a holistic hyperconverged infrastructure solution:
|
||||
|
||||
* Debian GNU/Linux as the base OS.
|
||||
* Linux KVM, QEMU, and Libvirt for VM management.
|
||||
* Linux `ip`, FRRouting, NFTables, DNSMasq, and PowerDNS for network management.
|
||||
* Ceph for storage management.
|
||||
* Apache Zookeeper for the primary cluster state database.
|
||||
* Patroni PostgreSQL manager for the secondary relation databases (DNS aggregation, Provisioner configuration).
|
||||
|
||||
|
||||
## Getting Started
|
||||
|
||||
To get started with PVC, please see the [About](https://parallelvirtualcluster.readthedocs.io/en/latest/about/) page for general information about the project, and the [Getting Started](https://parallelvirtualcluster.readthedocs.io/en/latest/getting-started/) page for details on configuring your first cluster.
|
||||
|
||||
|
||||
## Changelog
|
||||
|
||||
View the changelog in [CHANGELOG.md](https://github.com/parallelvirtualcluster/pvc/blob/master/CHANGELOG.md).
|
||||
|
||||
|
||||
## Screenshots
|
||||
|
||||
While PVC's API and internals aren't very screenshot-worthy, here is some example output of the CLI tool.
|
||||
|
||||
<p><img alt="Node listing" src="images/pvc-nodes.png"/><br/><i>Listing the nodes in a cluster</i></p>
|
||||
|
||||
<p><img alt="Network listing" src="images/pvc-networks.png"/><br/><i>Listing the networks in a cluster, showing 3 bridged and 1 IPv4-only managed networks</i></p>
|
||||
|
||||
<p><img alt="VM listing and migration" src="images/pvc-migration.png"/><br/><i>Listing a limited set of VMs and migrating one with status updates</i></p>
|
||||
|
||||
<p><img alt="Node logs" src="images/pvc-nodelog.png"/><br/><i>Viewing the logs of a node (keepalives and VM [un]migration)</i></p>
|
@ -1,13 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>PVC Client API Documentation</title>
|
||||
<meta charset="utf-8"/>
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<style> body { margin: 0; padding: 0; } </style>
|
||||
</head>
|
||||
<body>
|
||||
<redoc spec-url='./swagger.json' hide-loading></redoc>
|
||||
<script src="https://rebilly.github.io/ReDoc/releases/latest/redoc.min.js"> </script>
|
||||
</body>
|
||||
</html>
|
@ -1,355 +0,0 @@
|
||||
# PVC API architecture
|
||||
|
||||
The PVC API is a standalone client application for PVC. It interfaces directly with the Zookeeper database to manage state.
|
||||
|
||||
The API is built using Flask and is packaged in the Debian package `pvc-client-api`. The API depends on the common client functions of the `pvc-client-common` package as does the CLI client.
|
||||
|
||||
Details of the API interface can be found in [the manual](/manuals/api).
|
||||
|
||||
# PVC HTTP API manual
|
||||
|
||||
The PVC HTTP API client is built with Flask, a Python framework for creating API interfaces, and run directly with the PyWSGI framework. It interfaces directly with the Zookeeper cluster to send and receive information about the cluster. It supports authentication configured statically via tokens in the configuration file as well as SSL. It also includes the provisioner client, an optional section that can be used to create VMs automatically using a set of templates and standardized scripts.
|
||||
|
||||
The [`pvc-ansible`](https://github.com/parallelvirtualcluster/pvc-ansible) framework will install and configure the API by default, and enable the node daemon option for an instance of the API to follow the primary node, thus ensuring the API is listening on the upstream floating IP at all times.
|
||||
|
||||
## API Details
|
||||
|
||||
### SSL
|
||||
|
||||
The API accepts SSL certificate and key files via the `pvcapid.yaml` configuration to enable SSL support for the API, which protects the data and query values from snooping or tampering. SSL is strongly recommended if using the API outside of a trusted local area network.
|
||||
|
||||
### API authentication
|
||||
|
||||
Authentication for the API is available using a static list of tokens. These tokens can be any long string, but UUIDs are typical and simple to use. Within `pvc-ansible`, the list of tokens can be specified in the `pvc.yaml` `group_vars` file. Usually, you'd want one token for each user of the API, such as a WebUI, a 3rd-party client, or an administrative user. Within the configuration, each token can have a description; this is mostly for administrative clarity and is not actually used within the API itself.
|
||||
|
||||
The API provides session-based login using the `/api/v1/auth/login` and `/api/v1/auth/logout` options. If authentication is not enabled, these endpoints return a temporary redirect to the root (version) endpoint.
|
||||
|
||||
For one-time authentication, the `token` value can be specified to any API endpoint via the `X-Api-Key` header value. This is only checked if there is no valid session already established. If authentication is enabled, there is no valid session, and no `token` value is specified, the API will return a JSON `message` of `Authentication required` and HTTP code 401.
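For example, a one-time authenticated request against the root (version) endpoint might look like the following, where `<token>` is one of the configured token values:

```
curl -X GET -H "X-Api-Key: <token>" http://localhost:7370/api/v1/
```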
|
||||
|
||||
### Data formats
|
||||
|
||||
The PVC API consistently accepts HTTP POST requests with HTML form data as their bodies.
|
||||
|
||||
The PVC API consistently returns JSON bodies as its responses. For (most) POST endpoints and any failures (400, 401, 404, etc.), this body contains a "message" field with a text message indicating the result. For (most) GET endpoints, this body is a JSON representation of the data being provided, subject to the schema outlined in the API endpoint documentation.
|
||||
|
||||
## Provisioner
|
||||
|
||||
The provisioner subsection (`/api/v1/provisioner`) is used to create new virtual machines on a PVC cluster. By creating templates and scripts, then grouping these into profiles, VMs can be created based on dynamic, declarative configurations via direct installation or templating. Administrators can use this facility to automate the creation of VMs running most UNIX-like operating systems that can be installed in a parent host. It can also create VMs based on existing templates or ISO images to facilitate installing alternate operating systems such as Microsoft Windows.
|
||||
|
||||
### Templates
|
||||
|
||||
Templates are used to configure the four components that define a VM configuration. Templates can be created and managed via the API, then grouped into profiles.
|
||||
|
||||
#### System Templates
|
||||
|
||||
System templates define the basic configuration of a VM. This includes the number of vCPUs and amount of vRAM, as well as console access (either VNC or serial) and several pieces of PVC metadata.
|
||||
|
||||
Generally, a system template is usable across multiple VM profiles, so a small number of system templates defining several standard resource profiles can be reused widely.
|
||||
|
||||
Some elements of the system template are mandatory, but most are optional.
|
||||
|
||||
###### Example: Creating a system template
|
||||
|
||||
* Note: vRAM sizes are always specified in MB.
|
||||
|
||||
```
|
||||
curl -X POST http://localhost:7370/api/v1/provisioner/template/system?name=2cpu-1gb-serial\&vcpus=2\&vram=1024\&serial=true\&vnc=false\&node_limit='pvchv1,pvchv2'\&node_selector=mem\&start_with_node=false
|
||||
curl -X GET http://localhost:7370/api/v1/provisioner/template/system/2cpu-1gb-serial
|
||||
```
|
||||
|
||||
#### Network Templates
|
||||
|
||||
Network templates define the network configuration of a VM. These are tied into the PVC networking facility, and are quite simple. A MAC template is assigned to each template, which defines how MAC addresses are generated (either randomly, or via a simple templating system for static MAC addresses).
|
||||
|
||||
With a network template, various "nets" can be configured. A "net" defines a PVC virtual network VNI, which must be valid on the PVC cluster. The first net is assigned to the first Ethernet device (usually eth0 or ens2 in Linux), with each subsequent network being added as an additional interface in order.
|
||||
|
||||
###### Example: Creating a network template with two networks
|
||||
|
||||
```
|
||||
curl -X POST http://localhost:7370/api/v1/provisioner/template/network?name=net200+net300
|
||||
curl -X POST http://localhost:7370/api/v1/provisioner/template/network/net200+net300/net?vni=200
|
||||
curl -X POST http://localhost:7370/api/v1/provisioner/template/network/net200+net300/net/300
|
||||
curl -X GET http://localhost:7370/api/v1/provisioner/template/net200+net300
|
||||
```
|
||||
|
||||
#### Storage Templates
|
||||
|
||||
Storage templates define the Ceph RBD disks, as well as optional filesystems and mountpoints for Linux-based guests, of a VM. The template itself consists only of a name; disk or image entries are configured as additional elements similar to network templates.
|
||||
|
||||
Each disk in a storage template is identified by a sequential ID, usually "sda"/"vda", "sdb"/"vdb", etc., a size, and a Ceph RBD pool within the PVC cluster. These alone are all that is required, and will create raw, unformatted images of the specified size on the specified pool and attach them to the VM at the ID value. In addition to these basics, filesystems (with argument support) and mountpoints can also be specified. Filesystems specified here will be used to format the volume during the provisioning process, and mountpoints will mount the volume at the specified mountpoint during provisioning, so that a guest operating system can be installed on them during the process with a provisioning script.
|
||||
|
||||
In addition to disks, storage templates can also contain image entries. Like disk entries, they are identified by a sequential ID, as well as a source Ceph RBD pool and volume name. The specified volume may belong to a (shutdown) VM or be a dedicated template uploaded to the Ceph cluster.
|
||||
|
||||
###### Example: Creating a storage template with three mounted disks
|
||||
|
||||
* Note: You can also include the template name during creation.
|
||||
* Note: Disk sizes are always specified in GB.
|
||||
* Note: Filesystem arguments are passed as-is to the `mkfs` command and must use an `--opt=val` format to prevent splitting.
|
||||
|
||||
```
|
||||
curl -X POST http://localhost:7370/api/v1/provisioner/template/storage/ext4-root-var-log
|
||||
curl -X POST http://localhost:7370/api/v1/provisioner/template/storage/ext4-root-var-log/disk?disk_id=sda\&disk_size=4\&filesystem=ext4\&mountpoint=/\&pool=vms\&filesystem_arg='-L=root'
|
||||
curl -X POST http://localhost:7370/api/v1/provisioner/template/storage/ext4-root-var-log/disk/sdb?disk_size=4\&filesystem=ext4\&mountpoint=/var\&pool=vms\&filesystem_arg='-L=var'
|
||||
curl -X POST http://localhost:7370/api/v1/provisioner/template/storage/ext4-root-var-log/disk/sdc -d "disk_size=4\&filesystem=ext4\&mountpoint=/var/log\&pool=vms\&filesystem_arg='-L=log'\&filesystem_arg='-m=1'"
|
||||
curl -X GET http://localhost:7370/api/v1/provisioner/template/storage/ext4-root-var-log
|
||||
```
|
||||
|
||||
#### Userdata Templates
|
||||
|
||||
Userdata templates contain cloud-init userdata that is provided to VMs on their first boot via an EC2-compatible Metadata API running on the PVC cluster. A userdata template contains the full text of the userdata, including optional multi-part sections if desired.
|
||||
|
||||
A default userdata template called "empty" is created by default, and this can be used for any profile which does not require cloud-init userdata, since a template must always be specified.
|
||||
|
||||
Examples of userdata templates can be found in `/usr/share/pvc/provisioner/examples` when the API is installed.
|
||||
|
||||
###### Example: Creating a userdata template from the `userdata.yaml` example file
|
||||
|
||||
* Note: For the block text commands (userdata and scripts), using the HTTP POST body for the data is always better than a URL argument.
|
||||
|
||||
```
|
||||
curl -X POST http://localhost:7370/api/v1/provisioner/template/userdata?name=example-userdata -d "data=$( cat /usr/share/pvc/provisioner/examples/userdata.yaml )"
|
||||
curl -X GET http://localhost:7370/api/v1/provisioner/template/userdata?name=example-userdata
|
||||
```
|
||||
|
||||
### Scripts
|
||||
|
||||
Scripts automate the installation of VMs with Python. To make use of a script, at least one disk volume must be both formatted with a Linux-compatible filesystem and have a mountpoint (very likely `/`) configured. The specified disk is then mounted in a temporary directory on the active coordinator, and the script run against it. This script can then do any task required to set up and configure the VM, such as installing a Debian or Ubuntu system with debootstrap, obtaining a chroot and configuring GRUB, or almost any other task that the administrator may wish. All scripts are written in Python 3, which is then integrated into the provisioner's worker during VM creation and executed at the appropriate point.
|
||||
|
||||
Each script must contain a function called `install()` which accepts `**kwargs` and no other arguments. A number of default arguments are provided, including `vm_name`, the `temporary_directory`, and dictionaries of the `disks` and `networks`. Additional arguments can be specified in VM profiles to facilitate advanced configurations specific to particular VM types.
|
||||
|
||||
Examples of scripts can be found in `/usr/share/pvc/provisioner/examples` when the API is installed.
|
||||
|
||||
###### Example: Creating a script from the `debootstrap_script.py` example file
|
||||
|
||||
* Note: For the block text commands (userdata and scripts), using the HTTP POST body for the data is always better than a URL argument.
|
||||
|
||||
```
|
||||
curl -X POST http://localhost:7370/api/v1/provisioner/script/debootstrap-example -d "data=$( cat /usr/share/pvc/provisioner/examples/debootstrap_script.py )"
|
||||
curl -X GET http://localhost:7370/api/v1/provisioner/script/debootstrap-example
|
||||
```
|
||||
|
||||
### Profiles
|
||||
|
||||
Profiles group together the four template types and scripts, as well as optional script arguments, into a named profile which can be assigned to VMs on creation. When creating a VM, templates and scripts themselves are not explicitly specified; rather a profile is specified which then maps to these other values. This allows maximum flexibility, allowing a VM profile to combine the various templates and scripts in an arbitrary way. One potential usecase is to create a profile for a particular VM role, for instance a webserver, which will have a specific system, disk, network, and userdata configuration; multiple VMs can then be created with this profile to ensure they all contain the same resources and configuration.
|
||||
|
||||
###### Example: Creating a profile with the previously-created templates and some script arguments
|
||||
|
||||
* Note: Script arguments are specified as `name=value` pairs after the `arg=` argument.
|
||||
|
||||
```
|
||||
curl -X POST http://localhost:7370/api/v1/provisioner/profile/test-profile?system_template=2cpu-1gb-serial\&network_template=net200+net300\&disk_template=ext4-root-var-log\&userdata_template=example-userdata\&script=debootstrap-example\&arg=deb_release=buster\&arg=deb_mirror=http://deb.debian.org/debian\&arg=deb_packages=linux-image-amd64,grub-pc,cloud-init,python3-cffi-backend,wget
|
||||
curl -X GET http://localhost:7370/api/v1/provisioner/profile/test-profile
|
||||
```
|
||||
|
||||
### Creating VMs
|
||||
|
||||
VMs are created by specifying a name and a profile value. The provisioner API will then collect the details of the profile, and trigger the Celery worker (`pvc-provisioner-worker.service`) to begin creating the VM. The administrator can, at any point, obtain the status of the process via the Task ID, which is returned in the JSON body of the creation command. Once completed, by default, the resulting VM will be defined and started on the cluster, ready to use. If the VM uses cloud-init, it will then hit the Metadata API on startup to obtain the details of the VM as well as the userdata specified in the profile.
|
||||
|
||||
Additional options can also be specified at install time. Automatic definition of the VM and automatic startup of the VM can both be disabled via options to the creation command. The former is most useful when creating disk images from an installed set of VM disks, and the latter provides flexibility for the administrator to edit or review the final VM before starting it for the first time.
|
||||
|
||||
###### Example: Creating a VM and viewing its status
|
||||
|
||||
```
|
||||
curl -X POST http://localhost:7370/api/v1/provisioner/create?name=test1\&profile=test-profile
|
||||
curl -X GET http://localhost:7370/api/v1/provisioner/status/<task-id>
|
||||
```
|
||||
|
||||
## API Daemon Configuration
|
||||
|
||||
The API is configured using a YAML configuration file which is passed in to the API process by the environment variable `PVC_CONFIG_FILE`. When running with the default package and SystemD unit, this file is located at `/etc/pvc/pvcapid.yaml`.
|
||||
|
||||
### Conventions
|
||||
|
||||
* Settings may be `required`, `optional`, or `ignored`.
|
||||
|
||||
* Settings may `depends` on other settings. This indicates that, if one setting is enabled, the other setting is very likely `required` by that setting.
|
||||
|
||||
### `pvcapid.yaml`
|
||||
|
||||
Example configuration:
|
||||
|
||||
```
|
||||
---
|
||||
pvc:
|
||||
debug: True
|
||||
coordinators:
|
||||
- pvchv1
|
||||
- pvchv2
|
||||
- pvchv3
|
||||
api:
|
||||
listen_address: "127.0.0.1"
|
||||
listen_port: "7370"
|
||||
authentication:
|
||||
enabled: False
|
||||
secret_key: "aSuperLong&SecurePasswordString"
|
||||
tokens:
|
||||
- description: "testing"
|
||||
token: ""
|
||||
ssl:
|
||||
enabled: False
|
||||
cert_file: ""
|
||||
key_file: ""
|
||||
provisioner:
|
||||
database:
|
||||
host: 10.100.0.252
|
||||
port: 5432
|
||||
name: pvcapi
|
||||
user: pvcapi
|
||||
pass: pvcapi
|
||||
queue:
|
||||
host: localhost
|
||||
port: 6379
|
||||
path: /0
|
||||
ceph_cluster:
|
||||
storage_hosts:
|
||||
- pvchv1
|
||||
- pvchv2
|
||||
- pvchv3
|
||||
storage_domain: "s.bonilan.net"
|
||||
ceph_monitor_port: 6789
|
||||
ceph_storage_secret_uuid: "c416032b-2ce9-457f-a5c2-18704a3485f4"
|
||||
```
|
||||
|
||||
#### `debug`
|
||||
|
||||
* *required*
|
||||
|
||||
Whether to enable Debug mode or not. If enabled, the API will use the Flask debug runtime instead of the PyWSGI framework and will log additional output. Should not be enabled in production.
|
||||
|
||||
#### `coordinators`
|
||||
|
||||
* *required*
|
||||
|
||||
A list of coordinator hosts, used to generate the Zookeeper connection string.
|
||||
|
||||
#### `api` → `listen_address`
|
||||
|
||||
* *required*
|
||||
|
||||
The IP address for the API to listen on. Use `0.0.0.0` to specify "all interfaces".
|
||||
|
||||
#### `api` → `listen_port`
|
||||
|
||||
The port for the API to listen on.
|
||||
|
||||
#### `api` → `authentication` → `enabled`
|
||||
|
||||
* *required*
|
||||
|
||||
Whether to enable API authentication or not. Should usually be enabled in production deployments, especially if the API is available on untrusted networks.
|
||||
|
||||
#### `api` → `authentication` → `secret_key`
|
||||
|
||||
* *optional*
|
||||
* *requires* `authentication` → `enabled`
|
||||
|
||||
The Flask authentication secret key used to salt session credentials. Should be a long (>32-character) random string generated with `pwgen` or a similar tool.
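For example, a value such as the output of `pwgen -s 64 1` (one 64-character secure random string) would be suitable.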
|
||||
|
||||
#### `api` → `authentication` → `tokens`
|
||||
|
||||
* *optional*
|
||||
* *requires* `authentication` → `enabled`
|
||||
|
||||
A list of API authentication tokens that can be passed via the `X-Api-Key` header to authorize access to the API. Each list element contains the following fields:
|
||||
|
||||
##### `description`
|
||||
|
||||
* *ignored*
|
||||
|
||||
A text description of the token function or use. Not parsed by the API, but used for administrator reference in the configuration file.
|
||||
|
||||
##### `token`
|
||||
|
||||
* *required*
|
||||
|
||||
The token itself, usually a UUID created with `uuidgen` or a similar tool.
|
||||
|
||||
#### `api` → `ssl` → `enabled`
|
||||
|
||||
* *required*
|
||||
|
||||
Whether to enable SSL for the API or not. Should usually be enabled in production deployments, especially if the API is available on untrusted networks.
|
||||
|
||||
#### `api` → `ssl` → `cert_file`
|
||||
|
||||
The path to the SSL certificate file for the API to use.
|
||||
|
||||
#### `api` → `ssl` → `key_file`
|
||||
|
||||
The path to the SSL private key file for the API to use.
|
||||
|
||||
##### `provisioner` → `database` → `host`
|
||||
|
||||
* *required*
|
||||
|
||||
The hostname of the PostgreSQL instance for the Provisioner database. Should always be `localhost` except in advanced deployment scenarios.
|
||||
|
||||
##### `provisioner` → `database` → `port`
|
||||
|
||||
* *required*
|
||||
|
||||
The port of the PostgreSQL instance for the Provisioner database. Should always be `5432`.
|
||||
|
||||
##### `provisioner` → `database` → `name`
|
||||
|
||||
* *required*
|
||||
|
||||
The database name for the Provisioner database. Should always be `pvcapi`.
|
||||
|
||||
##### `provisioner` → `database` → `user`
|
||||
|
||||
* *required*
|
||||
|
||||
The username for the PVC API client to access the Provisioner database.
|
||||
|
||||
##### `provisioner` → `database` → `pass`
|
||||
|
||||
* *required*
|
||||
|
||||
The password for the PVC API client to access the Provisioner database.
|
||||
|
||||
#### `provisioner` → `queue` → `host`
|
||||
|
||||
* *required*
|
||||
|
||||
The hostname of the Redis instance for the Provisioner queue. Should always be `localhost` except in advanced deployment scenarios.
|
||||
|
||||
#### `provisioner` → `queue` → `port`
|
||||
|
||||
* *required*
|
||||
|
||||
The port of the Redis instance for the Provisioner queue. Should always be `6379`.
|
||||
|
||||
#### `provisioner` → `queue` → `path`
|
||||
|
||||
* *required*
|
||||
|
||||
The Redis path for the Provisioner queue. Should always be `/0`.
|
||||
|
||||
#### `provisioner` → `ceph_cluster` → `storage_hosts`
|
||||
|
||||
* *required*
|
||||
|
||||
A list of hosts which run the Ceph monitors for VM disks. Should usually be identical to the list of `coordinators` except in advanced deployments.
|
||||
|
||||
#### `provisioner` → `ceph_cluster` → `storage_domain`
|
||||
|
||||
* *required*
|
||||
|
||||
The storage domain of the cluster, used with the `storage_hosts` entries to form FQDNs for the Ceph monitors. Should usually be identical to the cluster `storage_domain` except in advanced deployments.
|
||||
|
||||
#### `provisioner` → `ceph_cluster` → `ceph_monitor_port`
|
||||
|
||||
* *required*
|
||||
|
||||
The Ceph monitor port. Should always be `6789`.
|
||||
|
||||
#### `provisioner` → `ceph_cluster` → `ceph_storage_secret_uuid`
|
||||
|
||||
* *required*
|
||||
|
||||
The Libvirt storage secret UUID for the Ceph cluster.
|
||||
|
||||
## API Endpoint Documentation
|
||||
|
||||
The full API endpoint and schema documentation [can be found here](/manuals/api-reference.html).
|
@ -1,19 +0,0 @@
|
||||
# PVC CLI architecture
|
||||
|
||||
The PVC CLI is a standalone client application for PVC. It interfaces with the PVC API, via a configurable list of clusters with customizable hosts, ports, addresses, and authentication.
|
||||
|
||||
The CLI is built using Click and is packaged in the Debian package `pvc-client-cli`. The CLI does not depend on any other PVC components and can be used independently on arbitrary systems.
|
||||
|
||||
The CLI is self-documenting, however [the manual](/manuals/cli) details the required configuration.
|
||||
|
||||
# PVC CLI client manual
|
||||
|
||||
The PVC CLI client is built with Click, a Python framework for creating self-documenting CLI applications. It interfaces with the PVC API.
|
||||
|
||||
Use the `-h` option at any level of the `pvc` CLI command to receive help about the available commands and options.
|
||||
|
||||
Before using the CLI on a non-PVC node system, at least one cluster must be added using the `pvc cluster` subcommands. Running the CLI on hosts which also run the PVC API (via its configuration at `/etc/pvc/pvcapid.yaml`) uses the special `local` cluster, reading information from the API configuration, by default.
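For example, to add a cluster named `mycluster` (adjust the address and any SSL or authentication options to your deployment):

`pvc cluster add -a <upstream_floating_ip> mycluster`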
|
||||
|
||||
## Configuration
|
||||
|
||||
The CLI client requires no configuration file. The only optional external environment variable is `PVC_CLUSTER`, which can be used to specify a cluster to connect to.
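For example, `export PVC_CLUSTER="mycluster"` will cause all subsequent `pvc` commands to target the `mycluster` cluster without requiring `-c mycluster` each time.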
|
@ -1,501 +0,0 @@
|
||||
# PVC Node Daemon architecture
|
||||
|
||||
The PVC Node Daemon is the heart of the PVC system and runs on each node to manage the state of the node and its configured resources. The daemon connects directly to the Zookeeper cluster for coordination and state.
|
||||
|
||||
The node daemon is built using Python 3.X and is packaged in the Debian package `pvc-daemon`.
|
||||
|
||||
Configuration of the daemon is documented in [the manual](/manuals/daemon), however it is recommended to use the [Ansible configuration system](https://github.com/parallelvirtualcluster/pvc-ansible) to configure the PVC cluster for you from scratch.
|
||||
|
||||
## Overall architecture
|
||||
|
||||
The PVC daemon is object-oriented - each cluster resource is represented by an Object, which is then present on each node in the cluster. This allows state changes to be reflected across the entire cluster should their data change.
|
||||
|
||||
During startup, the system scans the Zookeeper database and sets up the required objects. The database is then watched in real-time for additional changes to the database information.
|
||||
|
||||
## Startup sequence
|
||||
|
||||
The daemon startup sequence is documented below. The main daemon entry-point is `Daemon.py` inside the `pvcnoded` folder, which is called from the `pvcnoded.py` stub file.
|
||||
|
||||
0. The configuration is read from `/etc/pvc/pvcnoded.yaml` and the configuration object set up.
|
||||
|
||||
0. Any required filesystem directories, mostly dynamic directories, are created.
|
||||
|
||||
0. The logger is set up. If file logging is enabled, this is the state when the first log messages are written.
|
||||
|
||||
0. Host networking is configured based on the `pvcnoded.yaml` configuration file. In a normal cluster, this is the point where the node will become reachable on the network as all networking is handled by the PVC node daemon.
|
||||
|
||||
0. Sysctl tweaks are applied to the host system, to enable routing/forwarding between nodes via the host.
|
||||
|
||||
0. The node determines its coordinator state and starts the required daemons if applicable. In a normal cluster, this is the point where the dependent services such as Zookeeper, FRR, and Ceph become available. After this step, the daemon waits 5 seconds before proceeding to give these daemons a chance to start up.
|
||||
|
||||
0. The daemon connects to the Zookeeper cluster and starts its listener. If the Zookeeper cluster is unavailable, it will wait some time before abandoning the attempt and starting again from step 1.
|
||||
|
||||
0. Termination handling/cleanup is configured.
|
||||
|
||||
0. The node checks if it is already present in the Zookeeper cluster; if not, it will add itself to the database. Initial static options are also updated in the database here. The daemon state transitions from `stop` to `init`.
|
||||
|
||||
0. The node checks if Libvirt is accessible.
|
||||
|
||||
0. The node starts up the NFT firewall if applicable and configures the base rule-set.
|
||||
|
||||
0. The node ensures that `dnsmasq` is stopped (legacy check, might be safe to remove eventually).
|
||||
|
||||
0. The node begins setting up the object representations of resources, in order:
|
||||
|
||||
a. Node entries
|
||||
|
||||
b. Network entries, creating client networks and starting them as required.
|
||||
|
||||
c. Domain (VM) entries, starting up the VMs as required.
|
||||
|
||||
d. Ceph storage entries (OSDs, Pools, Volumes, Snapshots).
|
||||
|
||||
0. The node activates its keepalived timer and begins sending keepalive updates to the cluster. The daemon state transitions from `init` to `run` and the system has started fully.
|
||||
|
||||
# PVC Node Daemon manual
|
||||
|
||||
The PVC node daemon is built with Python 3 and is run directly on nodes. For details of the startup sequence and general layout, see the [architecture document](/architecture/daemon).
|
||||
|
||||
## Configuration
|
||||
|
||||
The Daemon is configured using a YAML configuration file which is passed in to the daemon process by the environment variable `PVCD_CONFIG_FILE`. When running with the default package and SystemD unit, this file is located at `/etc/pvc/pvcnoded.yaml`.
|
||||
|
||||
For most deployments, the management of the configuration file is handled entirely by the [PVC Ansible framework](https://github.com/parallelvirtualcluster/pvc-ansible) and should not be modified directly. Many options from the Ansible framework map directly into the configuration options in this file.
|
||||
|
||||
### Conventions
|
||||
|
||||
* Settings may be `required`, `optional`, or `ignored`.
|
||||
|
||||
* Settings may `depends` on other settings. This indicates that, if one setting is enabled, the other setting is very likely `required` by that setting.
|
||||
|
||||
### `pvcnoded.yaml`
|
||||
|
||||
Example configuration:
|
||||
|
||||
```
|
||||
pvc:
|
||||
node: pvchv1
|
||||
debug: False
|
||||
functions:
|
||||
enable_hypervisor: True
|
||||
enable_networking: True
|
||||
enable_storage: True
|
||||
enable_api: True
|
||||
cluster:
|
||||
coordinators:
|
||||
- pvchv1
|
||||
- pvchv2
|
||||
- pvchv3
|
||||
networks:
|
||||
upstream:
|
||||
domain: "mydomain.net"
|
||||
network: "1.1.1.0/24"
|
||||
floating_ip: "1.1.1.10/24"
|
||||
gateway: "1.1.1.1"
|
||||
cluster:
|
||||
domain: "pvc.local"
|
||||
network: "10.255.0.0/24"
|
||||
floating_ip: "10.255.0.254/24"
|
||||
storage:
|
||||
domain: "pvc.storage"
|
||||
network: "10.254.0.0/24"
|
||||
floating_ip: "10.254.0.254/24"
|
||||
coordinator:
|
||||
dns:
|
||||
database:
|
||||
host: localhost
|
||||
port: 5432
|
||||
name: pvcdns
|
||||
user: pvcdns
|
||||
pass: pvcdnsPassw0rd
|
||||
metadata:
|
||||
database:
|
||||
host: localhost
|
||||
port: 5432
|
||||
name: pvcapi
|
||||
user: pvcapi
|
||||
pass: pvcapiPassw0rd
|
||||
system:
|
||||
fencing:
|
||||
intervals:
|
||||
keepalive_interval: 5
|
||||
fence_intervals: 6
|
||||
suicide_intervals: 0
|
||||
actions:
|
||||
successful_fence: migrate
|
||||
failed_fence: None
|
||||
ipmi:
|
||||
host: pvchv1-lom
|
||||
user: admin
|
||||
pass: Passw0rd
|
||||
migration:
|
||||
target_selector: mem
|
||||
configuration:
|
||||
directories:
|
||||
dynamic_directory: "/run/pvc"
|
||||
log_directory: "/var/log/pvc"
|
||||
console_log_directory: "/var/log/libvirt"
|
||||
logging:
|
||||
file_logging: True
|
||||
stdout_logging: True
|
||||
log_colours: True
|
||||
log_dates: True
|
||||
log_keepalives: True
|
||||
log_keepalive_cluster_details: True
|
||||
log_keepalive_storage_details: True
|
||||
console_log_lines: 1000
|
||||
networking:
|
||||
bridge_device: ens4
|
||||
bridge_mtu: 1500
|
||||
sriov_enable: True
|
||||
sriov_device:
|
||||
- phy: ens1f0
|
||||
mtu: 9000
|
||||
vfcount: 7
|
||||
upstream:
|
||||
device: ens4
|
||||
mtu: 1500
|
||||
address: None
|
||||
cluster:
|
||||
device: ens4
|
||||
mtu: 1500
|
||||
address: by-id
|
||||
storage:
|
||||
device: ens4
|
||||
mtu: 1500
|
||||
address: by-id
|
||||
```
|
||||
|
||||
#### `node`
|
||||
|
||||
* *required*
|
||||
|
||||
The (short) hostname of the node; host-specific.
|
||||
|
||||
#### `debug`
|
||||
|
||||
* *required*
|
||||
|
||||
Whether to enable or disable debug mode. Debug mode enables additional logging of subtasks throughout the system.
|
||||
|
||||
#### `functions` → `enable_hypervisor`
|
||||
|
||||
* *required*
|
||||
|
||||
Whether to enable the hypervisor functionality of the PVC Daemon or not. This should usually be enabled except in advanced deployment scenarios (such as a dedicated quorum-keeping micro-node or dedicated network routing node).
|
||||
|
||||
#### `functions` → `enable_networking`
|
||||
|
||||
* *required*
|
||||
|
||||
Whether to enable the client network functionality of the PVC Daemon or not. This should usually be enabled except in deployment scenarios where networking is completely unmanaged by PVC.
|
||||
|
||||
#### `functions` → `enable_storage`
|
||||
|
||||
* *required*
|
||||
|
||||
Whether to enable the virtual storage functionality of the PVC Daemon or not. This should usually be enabled except in advanced deployment scenarios featuring unmanaged external storage.
|
||||
|
||||
#### `functions` → `enable_api`
|
||||
|
||||
Whether to enable the PVC API client on the cluster floating IPs or not.
|
||||
|
||||
#### `cluster` → `coordinators`
|
||||
|
||||
* *required*
|
||||
|
||||
A list of coordinator hosts, used to generate the Zookeeper connection string and determine if the current host is a coordinator or not.
|
||||
|
||||
#### `cluster` → `networks`
|
||||
|
||||
* *optional*
|
||||
* *requires* `functions` → `enable_networking`
|
||||
|
||||
Contains a dictionary of networks and their configurations for the PVC cluster. Optional only if `enable_networking` is `False`. The three required network types/names are `upstream`, `cluster`, and `storage`. Each network type contains the following entries.
|
||||
|
||||
##### `domain`
|
||||
|
||||
* *required*
|
||||
|
||||
The domain name for the network. Should be a valid domain name, or `None`. Specifically for the `upstream` network, this should match the domain portion of the node hostname.
|
||||
|
||||
##### `network`
|
||||
|
||||
The CIDR-formatted IPv4 address block for the network.
|
||||
|
||||
##### `floating_ip`
|
||||
|
||||
The CIDR-formatted IPv4 address for the floating IP within the network. This IP will belong exclusively to the `primary` coordinator node to provide a central entrypoint for functionality on the cluster.
|
||||
|
||||
##### `gateway`
|
||||
|
||||
The IPv4 address for the gateway of the network. Usually applicable only to the `upstream` network, as the other two are normally unrouted and local to the cluster.
|
||||
|
||||
#### `coordinator`
|
||||
|
||||
* *optional*
|
||||
* *requires* `functions` → `enable_networking`
|
||||
|
||||
Configuration for coordinator functions on the node. Optional only if `enable_networking` is `False`. Not optional on non-coordinator hosts, though unused. Contains the following sub-entries.
|
||||
|
||||
##### `dns` → `database` → `host`
|
||||
|
||||
* *required*
|
||||
|
||||
The hostname of the PostgreSQL instance for the DNS aggregator database. Should always be `localhost` except in advanced deployment scenarios.
|
||||
|
||||
##### `dns` → `database` → `port`
|
||||
|
||||
* *required*
|
||||
|
||||
The port of the PostgreSQL instance for the DNS aggregator database. Should always be `5432`.
|
||||
|
||||
##### `dns` → `database` → `name`
|
||||
|
||||
* *required*
|
||||
|
||||
The database name for the DNS aggregator database. Should always be `pvcdns`.
|
||||
|
||||
##### `dns` → `database` → `user`
|
||||
|
||||
* *required*
|
||||
|
||||
The username for the PVC node daemon to access the DNS aggregator database.
|
||||
|
||||
##### `dns` → `database` → `pass`
|
||||
|
||||
* *required*
|
||||
|
||||
The password for the PVC node daemon to access the DNS aggregator database.
|
||||
|
||||
##### `metadata` → `database` → `host`
|
||||
|
||||
* *required*
|
||||
|
||||
The hostname of the PostgreSQL instance for the Provisioner database. Should always be `localhost` except in advanced deployment scenarios.
|
||||
|
||||
##### `metadata` → `database` → `port`
|
||||
|
||||
* *required*
|
||||
|
||||
The port of the PostgreSQL instance for the Provisioner database. Should always be `5432`.
|
||||
|
||||
##### `metadata` → `database` → `name`
|
||||
|
||||
* *required*
|
||||
|
||||
The database name for the Provisioner database. Should always be `pvcapi`.
|
||||
|
||||
##### `metadata` → `database` → `user`
|
||||
|
||||
* *required*
|
||||
|
||||
The username for the PVC node daemon to access the Provisioner database.
|
||||
|
||||
##### `metadata` → `database` → `pass`
|
||||
|
||||
* *required*
|
||||
|
||||
The password for the PVC node daemon to access the Provisioner database.
|
||||
|
||||
#### `system` → `intervals` → `keepalive_interval`
|
||||
|
||||
* *required*
|
||||
|
||||
The number of seconds between keepalive messages to the cluster. The default is 5 seconds; for slow cluster nodes, 10-30 seconds may be more appropriate, though this will result in slower responses to changes in the cluster and less accurate/up-to-date information in the clients.
|
||||
|
||||
#### `system` → `intervals` → `fence_intervals`
|
||||
|
||||
* *required*
|
||||
|
||||
The number of keepalive messages that can be missed before a node is considered dead and the fencing cycle triggered on it. The default is 6, or 30 seconds of inactivity with a 5 second `keepalive_interval`. Can be set to 0 to disable fencing as the timeout will never trigger.
|
||||
|
||||
#### `system` → `intervals` → `suicide_intervals`
|
||||
|
||||
* *required*
|
||||
|
||||
The number of keepalive messages that can be missed before a node considers itself dead and forcibly resets itself. Note that, due to the large number of reasons a node could become unresponsive, the suicide interval alone should not be relied upon. The default is 0, which disables this functionality. If set, it should usually be equal to or less than `fence_intervals` for maximum safety.
|
||||
|
||||
#### `system` → `fencing` → `actions` → `successful_fence`
|
||||
|
||||
* *required*
|
||||
|
||||
The action to take regarding VMs once a node is *successfully* fenced, i.e. the IPMI command to restart the node reports a success. Can be one of `migrate` (the default), to migrate and start all failed VMs on other nodes, or `None`, to perform no action.
|
||||
|
||||
#### `system` → `fencing` → `actions` → `failed_fence`
|
||||
|
||||
* *required*
|
||||
|
||||
The action to take regarding VMs once a node fencing *fails*, i.e. the IPMI command to restart the node reports a failure. Can be one of `None` (the default), to perform no action, or `migrate`, to migrate and start all failed VMs on other nodes.
|
||||
|
||||
**WARNING:** This functionality is potentially **dangerous** and can result in data loss or corruption in the VM disks; the post-fence migration process *explicitly clears RBD locks on the disk volumes*. It is designed only for specific and advanced use-cases, such as servers that do not reliably report IPMI responses or servers without IPMI (not recommended; see the [cluster architecture documentation](/architecture/cluster)). If this is set to `migrate`, the `suicide_intervals` **must** be set to provide at least some guarantee that the VMs on the node will actually be terminated before this condition triggers. The administrator should think very carefully about their setup and potential failure modes before enabling this option.
|
||||
|
||||
#### `system` → `fencing` → `ipmi` → `host`
|
||||
|
||||
* *required*
|
||||
|
||||
The hostname or IP address of this node's IPMI interface. Must be reachable from the nodes.
|
||||
|
||||
#### `system` → `fencing` → `ipmi` → `user`
|
||||
|
||||
* *required*
|
||||
|
||||
The username for the PVC node daemon to log in to the IPMI interface. Must have permission to reboot the host (command `ipmitool chassis power reset`).
|
||||
|
||||
#### `system` → `fencing` → `ipmi` → `pass`
|
||||
|
||||
* *required*
|
||||
|
||||
The password for the PVC node daemon to log in to the IPMI interface.
|
||||
|
||||
#### `system` → `migration` → `target_selector`
|
||||
|
||||
* *required*
|
||||
|
||||
The default selector algorithm to use when migrating VMs away from a node; individual VMs can override this default.
|
||||
|
||||
Valid `target_selector` values are:
|
||||
* `mem`: choose the node with the least provisioned VM memory
|
||||
* `memfree`: choose the node with the most (real) free memory
|
||||
* `vcpus`: choose the node with the least allocated VM vCPUs
|
||||
* `load`: choose the node with the lowest current load average
|
||||
* `vms`: choose the node with the least number of provisioned VMs
|
||||
|
||||
For most clusters, `mem` should be sufficient, but others may be used based on the cluster workload and available resources. The following caveats should be considered:
|
||||
* `mem` looks at the provisioned memory, not the allocated memory; thus, stopped or disabled VMs are counted towards a node's memory for this selector, even though their memory is not actively in use.
|
||||
* `memfree` looks at the free memory of the node in general, ignoring the amount provisioned to VMs; if any VM's internal memory usage changes, this value would be affected. This might be preferable to `mem` on clusters with very high memory utilization versus total capacity or if many VMs are stopped/disabled.
|
||||
* `load` looks at the system load of the node in general, ignoring load in any particular VMs; if any VM's CPU usage changes, this value would be affected. This might be preferable on clusters with some very CPU intensive VMs.
|
||||
|
||||
#### `system` → `configuration` → `directories` → `dynamic_directory`
|
||||
|
||||
* *required*
|
||||
|
||||
The directory to store ephemeral configuration files. Usually `/run/pvc` or a similar temporary directory.
|
||||
|
||||
#### `system` → `configuration` → `directories` → `log_directory`
|
||||
|
||||
* *required*
|
||||
|
||||
The directory to store log files for `file_logging`. Usually `/var/log/pvc` or a similar directory. Must be specified even if `file_logging` is `False`, though ignored.
|
||||
|
||||
#### `system` → `configuration` → `directories` → `console_log_directory`
|
||||
|
||||
* *required*
|
||||
|
||||
The directory to store VM console logs. Usually `/var/log/libvirt` or a similar directory.
|
||||
|
||||
#### `system` → `configuration` → `logging` → `file_logging`
|
||||
|
||||
* *required*
|
||||
|
||||
Whether to enable direct logging to a file in `log_directory` or not.
|
||||
|
||||
#### `system` → `configuration` → `logging` → `stdout_logging`
|
||||
|
||||
* *required*
|
||||
|
||||
Whether to enable logging to stdout or not; captured by SystemD and JournalD by default.
|
||||
|
||||
#### `system` → `configuration` → `logging` → `log_colours`
|
||||
|
||||
* *required*
|
||||
|
||||
Whether to log ANSI colour sequences in the log output or not.
|
||||
|
||||
#### `system` → `configuration` → `logging` → `log_dates`
|
||||
|
||||
* *required*
|
||||
|
||||
Whether to log the current date and time in the log output or not.
|
||||
|
||||
#### `system` → `configuration` → `logging` → `log_keepalives`
|
||||
|
||||
* *required*
|
||||
|
||||
Whether to log keepalive messages or not.
|
||||
|
||||
#### `system` → `configuration` → `logging` → `log_keepalive_cluster_details`
|
||||
|
||||
* *required*
|
||||
|
||||
Whether to log node status information during keepalives or not.
|
||||
|
||||
#### `system` → `configuration` → `logging` → `log_keepalive_storage_details`
|
||||
|
||||
* *required*
|
||||
|
||||
Whether to log storage cluster status information during keepalives or not.
|
||||
|
||||
#### `system` → `configuration` → `logging` → `console_log_lines`
|
||||
|
||||
* *required*
|
||||
|
||||
How many lines of VM console logs to keep in the Zookeeper database for each VM.
|
||||
|
||||
#### `system` → `configuration` → `networking` → `bridge_device`
|
||||
|
||||
* *optional*
|
||||
* *requires* `functions` → `enable_networking`
|
||||
|
||||
The network interface device used to create Bridged client network vLANs on. For most clusters, should match the underlying device of the various static networks (e.g. `ens4` or `bond0`), though may also use a separate network interface.
|
||||
|
||||
#### `system` → `configuration` → `networking` → `bridge_mtu`
|
||||
|
||||
* *optional*
|
||||
* *requires* `functions` → `enable_networking`
|
||||
|
||||
The network interface MTU for the Bridged client network device. This is the maximum MTU a bridged client network can use.
|
||||
|
||||
#### `system` → `configuration` → `networking` → `sriov_enable`
|
||||
|
||||
* *optional*, defaults to `False`
|
||||
* *requires* `functions` → `enable_networking`
|
||||
|
||||
Enables (or disables) SR-IOV functionality in PVC. If enabled, at least one `sriov_device` entry should be specified.
|
||||
|
||||
#### `system` → `configuration` → `networking` → `sriov_device`
|
||||
|
||||
* *optional*
|
||||
* *requires* `functions` → `enable_networking`
|
||||
|
||||
Contains a list of SR-IOV PF (physical function) devices and their basic configuration. Each element contains the following entries:
|
||||
|
||||
##### `phy`
|
||||
|
||||
* *required*
|
||||
|
||||
The raw Linux network device with SR-IOV PF functionality.
|
||||
|
||||
##### `mtu`
|
||||
|
||||
The MTU of the PF device, set on daemon startup.
|
||||
|
||||
##### `vfcount`
|
||||
|
||||
The number of VF devices to create on this PF. VF devices are then managed via PVC on a per-node basis.
|
||||
|
||||
#### `system` → `configuration` → `networking`
|
||||
|
||||
* *optional*
|
||||
* *requires* `functions` → `enable_networking`
|
||||
|
||||
Contains a dictionary of networks and their configurations on this node. Optional only if `enable_networking` is `False`. The three required network types/names are `upstream`, `cluster`, and `storage`. Each network type contains the following entries.
|
||||
|
||||
##### `device`
|
||||
|
||||
* *required*
|
||||
|
||||
The raw Linux network device that the network exists on.
|
||||
|
||||
##### `mtu`
|
||||
|
||||
* *required*
|
||||
|
||||
The MTU of the network device.
|
||||
|
||||
##### `address`
|
||||
|
||||
* *required*
|
||||
|
||||
The IPv4 address of the interface. Can be one of: `None`, for no IP address; `by-id`, to automatically select an address in the relevant `networks` section via the host ID (e.g. node1 will get `.1`, node2 will get `.2`, etc.); or a static CIDR-formatted IP address.
|
@ -1,441 +0,0 @@
|
||||
# PVC Provisioner Manual
|
||||
|
||||
The PVC provisioner is a subsection of the main PVC API. It interfaces directly with the Zookeeper database using the common client functions, and with the Patroni PostgreSQL database to store details. The provisioner also interfaces directly with the Ceph storage cluster, for mapping volumes, creating filesystems, and installing guests.
|
||||
|
||||
Details of the Provisioner API interface can be found in [the API manual](/manuals/api).
|
||||
|
||||
- [PVC Provisioner Manual](#pvc-provisioner-manual)
|
||||
* [Overview](#overview)
|
||||
* [PVC Provisioner concepts](#pvc-provisioner-concepts)
|
||||
+ [Templates](#templates)
|
||||
+ [Userdata](#cloud-init-userdata)
|
||||
+ [Scripts](#provisioning-scripts)
|
||||
+ [Profiles](#profiles)
|
||||
* [Deploying VMs from provisioner scripts](#deploying-vms-from-provisioner-scripts)
|
||||
* [Deploying VMs from OVA images](#deploying-vms-from-ova-images)
|
||||
+ [Uploading an OVA](#uploading-an-ova)
|
||||
+ [The OVA Provisioning Script](#the-ova-provisioning-script)
|
||||
+ [OVA limitations](#ova-limitations)
|
||||
|
||||
## Overview
|
||||
|
||||
The purpose of the Provisioner API is to provide a convenient way for administrators to automate the creation of new virtual machines on the PVC cluster.
|
||||
|
||||
The Provisioner allows the administrator to construct descriptions of VMs, called profiles, which include system resource specifications, network interfaces, disks, cloud-init userdata, and installation scripts. These profiles are highly modular, allowing the administrator to specify arbitrary combinations of the mentioned VM features with which to build new VMs.
|
||||
|
||||
The provisioner supports creating VMs based off of installation scripts, by cloning existing volumes, and by uploading OVA image templates to the cluster.
|
||||
|
||||
Examples in the following sections use the CLI exclusively for demonstration purposes. For details of the underlying API calls, please see the [API interface reference](/manuals/api-reference.html).
|
||||
|
||||
Use of the PVC Provisioner is not required. Administrators can always perform their own installation tasks, and the provisioner is not specially integrated, calling various other API commands as though they were run from the CLI or API.
|
||||
|
||||
# PVC Provisioner concepts
|
||||
|
||||
Before explaining how to create VMs using either OVA images or installer scripts, we must discuss the concepts used to construct the PVC provisioner system.
|
||||
|
||||
## Templates
|
||||
|
||||
Templates are the building blocks of VMs. Each template type specifies part of the configuration of a VM, and when combined together later into profiles, provide a full description of the VM resources.
|
||||
|
||||
Templates are used to provide flexibility for the administrator. For instance, one could specify some standard core resources for different VMs, but then specify a different set of storage devices and networks for each one. This flexibility is at the heart of this system, allowing the administrator to construct a complex set of VM configurations from a few basic templates.
|
||||
|
||||
The PVC Provisioner features three types of templates: System Templates, Network Templates, and Disk Templates.
|
||||
|
||||
### System Templates
|
||||
|
||||
System templates specify the basic resources of the virtual machine: vCPUs, memory, serial/VNC consoles, and PVC configuration metadata (migration methods, node limits, etc.). Each profile requires a single system template.
|
||||
|
||||
The simplest valid template will specify a number of vCPUs and an amount of vRAM; additional details are optional and can be specified if required.
|
||||
|
||||
Serial consoles are required to make use of the `pvc vm log` functionality, via console logfiles in `/var/log/libvirt` on the nodes. VMs without a serial console show an empty log. Note that the guest operating system must also be configured to provide output to this serial console for this functionality to work as expected.
|
||||
|
||||
VNC consoles permit graphical access to the VM. By default, the VNC interface listens only on 127.0.0.1 on its parent node; the VNC bind configuration can override this to listen on other interfaces, including `0.0.0.0` for all.
|
||||
|
||||
PVC does not currently support SPICE or any other non-VNC consoles.
|
||||
|
||||
#### Examples
|
||||
|
||||
```
|
||||
$ pvc provisioner template system list
|
||||
Using cluster "local" - Host: "10.0.0.1:7370" Scheme: "http" Prefix: "/api/v1"
|
||||
|
||||
System templates:
|
||||
|
||||
Name ID vCPUs vRAM [MB] Consoles: Serial VNC VNC bind Metadata: Limit Selector Autostart Migration
|
||||
ext-lg 80 4 8192 False False None None None False None
|
||||
ext-lg-ser 81 4 8192 True False None None None False None
|
||||
ext-lg-vnc 82 4 8192 False True 0.0.0.0 None None False None
|
||||
ext-sm-lim 83 1 1024 True False None pvchv1,pvchv2 mem True live
|
||||
```
|
||||
|
||||
* The first example specifies a template with 4 vCPUs and 8GB of RAM. It has no serial or VNC consoles, and no non-default metadata, forming the most basic possible system template.
|
||||
|
||||
* The second example specifies a template with the same vCPU and RAM quantities as the first, but with a serial console as well. VMs using this template will be able to make use of `pvc vm log` as long as their guest operating system is configured to use it.
|
||||
|
||||
* The third example specifies a template with an alternate console to the second, in this case a VNC console bound to `0.0.0.0` (all interfaces). VNC ports are always auto-selected due to the dynamic nature of PVC, and the administrator can connect to them once the VM is running by determining the port on the hosting hypervisor (e.g. with `netstat -tl`).
|
||||
|
||||
* The fourth example shows the ability to set PVC cluster metadata in a system template. VMs with this template will be forcibly limited to running on the hypervisors `pvchv1` and `pvchv2`, but no others, will explicitly use the `mem` (free memory) selector when choosing migration or deployment targets, will be set to automatically start on reboot of its hypervisor, and will be limited to live migration between nodes. For full details on what these options mean, see `pvc vm meta -h`.
|
||||
|
||||
### Network Templates
|
||||
|
||||
Network templates specify which PVC networks the virtual machine will be bound to, as well as the method used to calculate MAC addresses for VM interfaces. Networks are specified by their VNI ID within PVC.
|
||||
|
||||
A network template requires at least one network VNI to be valid, and is created in two stages. First, `pvc provisioner template network add` adds the template itself, along with the optional MAC template. Second, `pvc provisioner template network vni add` adds a VNI into the network template. VNIs are always shown and created in the order added; to move networks around they must be removed then re-added in the proper order; this will not affect existing VMs provisioned with the template.
|
||||
|
||||
In some cases, it may be useful for the administrator to specify a static MAC address pattern for a set of VMs, for instance if they must get consistent DHCP reservations between rebuilds. Such a MAC address template can be specified when adding a new network template, using a standardized layout and set of interpolated variables. This is an optional feature; if no MAC template is specified, VMs will be configured with random MAC addresses for each interface at deploy time.
|
||||
|
||||
#### Examples
|
||||
|
||||
```
|
||||
$ pvc provisioner template network list
|
||||
Using cluster "local" - Host: "10.0.0.1:7370" Scheme: "http" Prefix: "/api/v1"
|
||||
|
||||
Network templates:
|
||||
|
||||
Name ID MAC template Network VNIs
|
||||
ext-101 80 None 101
|
||||
ext-11X 81 None 110,1101
|
||||
fixed-mac 82 {prefix}:ff:ff:{vmid}{netid} 1000,1001,1002
|
||||
```
|
||||
|
||||
* The first example shows a simple single-VNI network with no MAC template.
|
||||
|
||||
* The second example shows a dual-VNI network with no MAC template. Note the ordering; as mentioned, the first VNI will be provisioned on `eth0`, the second VNI on `eth1`, etc.
|
||||
|
||||
* The third example shows a triple-VNI network with a MAC template. The variable names shown are literal, while the `f` values are user-configurable and must be set to valid hexadecimal values by the administrator to uniquely identify the MAC address (in this case, using `ff:ff` for that segment). The variables are interpolated at deploy time as follows (a short interpolation sketch follows this list):
|
||||
|
||||
* The `{prefix}` variable is replaced by the provisioner with a standard prefix (`52:54:01`), which is different from the randomly-generated MAC prefix (`52:54:00`) to avoid accidental overlap of MAC addresses. These OUI prefixes are not assigned to any vendor by the IEEE and thus should not conflict with any (real, standards-compliant) devices on the network.
|
||||
|
||||
* The `{vmid}` variable is replaced by a single hexadecimal digit representing the VM's ID, the numerical suffix portion of its name (e.g. `myvm2` will have ID 2); VMs without a suffix numeral in their names have ID 0. VMs with IDs greater than 15 (hexadecimal `f`) will wrap back to 0, so a single MAC template should never be used by more than 16 VMs (numbered 0-15).
|
||||
|
||||
* The `{netid}` variable is replaced by a single hexadecimal digit representing the sequential identifier, starting at 0, of the interface within the template (i.e. the first interface is 0, the second is 1, etc.). Like the VM ID, network IDs greater than 15 (hexadecimal `f`) will wrap back to 0, so a single VM should never have more than 16 interfaces.
|
||||
|
||||
* The location of the two per-VM variables can be adjusted at the administrator's discretion, or removed if not required (e.g. a single-network template, or template for a single VM). In such situations, be careful to avoid accidental overlap with other templates' variable portions.
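To illustrate how these variables combine, here is a minimal Python sketch of the interpolation described above; it is not the PVC implementation, and the prefix and wrap-around behaviour shown are simply those documented here:

```python
def interpolate_mac(template, vm_id, net_id):
    """Render a MAC template such as '{prefix}:ff:ff:{vmid}{netid}'."""
    return template.format(
        prefix="52:54:01",               # standard provisioner prefix
        vmid=format(vm_id % 16, "x"),    # wraps back to 0 above 15
        netid=format(net_id % 16, "x"),  # wraps back to 0 above 15
    )

# VM "myvm2" (VM ID 2), second interface (net ID 1):
print(interpolate_mac("{prefix}:ff:ff:{vmid}{netid}", 2, 1))  # 52:54:01:ff:ff:21
```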
|
||||
|
||||
### Disk Templates
|
||||
|
||||
Disk templates specify the disk layout, including filesystem and mountpoint for scripted deployments, for the VM. Disks are specified by their virtual disk ID in Libvirt, in either `sdX` or `vdX` format, and sizes are always specified in GB. Disks may also reference other storage volumes, which will then be cloned during provisioning.
|
||||
|
||||
For additional flexibility, the volume filesystem and mountpoint are optional; such volumes will be created and attached to the VM but will not be modified during provisioning.
|
||||
|
||||
All storage volumes created by the provisioner at deploy time, regardless of source or type, will be named in the format `<vmname>_<id>`, for instance `myvm_sda`.
|
||||
|
||||
#### Examples
|
||||
|
||||
```
|
||||
$ pvc provisioner template storage list
|
||||
Using cluster "local" - Host: "10.0.0.1:7370" Scheme: "http" Prefix: "/api/v1"
|
||||
|
||||
Storage templates:
|
||||
|
||||
Name ID Disk ID Pool Source Volume Size [GB] Filesystem Arguments Mountpoint
|
||||
standard-ext4 21
|
||||
sda vms None 2 ext4 -L=root /
|
||||
sdb vms None 4 ext4 -L=var /var
|
||||
sdc vms None 4 ext4 -L=log /var/log
|
||||
large-cloned 22
|
||||
sda vms template_sda None None None None
|
||||
sdb vms None 40 None None None
|
||||
```
|
||||
|
||||
* The first example shows a template with a simple 3-disk layout suitable for most Linux distributions. Each volume is in pool `vms`, with an `ext4` filesystem, an argument specifying a disk label, and a mountpoint to which the volume will be mounted when deploying the VM. All 3 volumes will be created at deploy time. When deploying VMs using the Scripts detailed below, this is the normal format storage templates should take, ensuring that all block devices are formatted and mounted in the proper place for the script to take over and install the operating system onto them.
|
||||
|
||||
* The second example shows both a cloned volume and a blank volume. At deploy time, the Source Volume for the `sda` device will be cloned and attached to the VM at `sda`. The second volume will be created at deploy time, but will not be formatted or mounted, and will thus show as an empty block device inside the VM. This type of storage template is better suited to deployments that do not use the Script install method, and are instead cloned from a source volume, either another running VM or a manually-uploaded disk image.
|
||||
|
||||
* Unformatted block devices as shown in the second example can be used in any type of storage template, though care should be taken to consider their purpose; unformatted block devices are completely ignored by the Script at deploy time.
|
||||
|
||||
## Cloud-Init Userdata
|
||||
|
||||
PVC allows the sending of arbitrary cloud-init userdata to VMs on boot-up. It uses an Amazon AWS EC2-style metadata service, listening at the link-local IP `169.254.169.254` on port `80`, to deliver basic VM information and this userdata to the VMs. The metadata sent is determined dynamically from the assigned profile of the VM at boot time.
|
||||
|
||||
Both single-function and multipart cloud-init userdata are supported. Full examples can be found under `/usr/share/pvc/provisioner/examples` on any PVC coordinator node.
|
||||
|
||||
The default userdata document "empty" can be used to skip userdata for a profile.
|
||||
|
||||
#### Examples
|
||||
|
||||
```
|
||||
$ pvc provisioner userdata list
|
||||
Using cluster "local" - Host: "10.0.0.1:7370" Scheme: "http" Prefix: "/api/v1"
|
||||
|
||||
Name ID Document
|
||||
empty 10
|
||||
basic-ssh 11 Content-Type: text/cloud-config; charset="us-ascii"
|
||||
MIME-Version: 1.0
|
||||
|
||||
#cloud-config
|
||||
[...]
|
||||
```
|
||||
|
||||
* The first example is the default, always-present `empty` document, which is sent to invalid VMs if requested, or can be configured explicitly for profiles that do not require cloud-init userdata, instead of leaving that section of the profile as `None`.
|
||||
|
||||
* The second, truncated, example is the start of a normal single-function userdata document. For full details on the contents of these documents, see the cloud-init documentation.
|
||||
|
||||
## Provisioning Scripts
|
||||
|
||||
The PVC provisioner provides a scripting framework to automate VM installation. This is generally most useful with UNIX-like systems, which can be installed over the network via shell scripts. For instance, a script might install a Debian VM using `debootstrap`, which is installed by default. However, all deployment profiles require some provisioning script, at minimum to craft their Libvirt configuration.
|
||||
|
||||
Several example scripts are provided in the `/usr/share/pvc/provisioner/examples/scripts` directory of all PVC hypervisors. These can be imported into the provisioner system as-is to help get you started, modified or extended as you wish, or used as the basis for your own scripts.
|
||||
|
||||
Provisioner scripts are written in Python 3 and are implemented as a class, `VMBuilderScript`, which extends the built-in `VMBuilder` class, for example:
|
||||
|
||||
```python
|
||||
#!/usr/bin/env python3
|
||||
# I am an example provisioner script
|
||||
|
||||
from pvcapid.vmbuilder import VMBuilder
|
||||
|
||||
class VMBuilderScript(VMBuilder):
|
||||
def setup(self):
|
||||
...
|
||||
```
|
||||
|
||||
Each `VMBuilderScript` class should implement the 5 functions defined by the `VMBuilder` class (any it does not implement are no-ops). All 5 functions take no arguments except `self`; data is passed to them from the parent `VMBuilder` class as outlined below. Each function handles a specific part of the installation process, allowing each step to be automated with maximum flexibility (a fuller skeleton is sketched after this list):
|
||||
|
||||
* `setup()`: Performs any special initial setup (e.g. fetching scripts or configs from the Internet) and validation of the environment (e.g. checking if particular binaries are available) before proceeding with the install.
|
||||
|
||||
* `create()`: Creates the VM libvirt XML definition based on the information provided by the VM profile and arguments. This is the only function that returns data (namely, the string representation of the XML config).
|
||||
|
||||
* `prepare()`: Creates and prepares any RBD storage volumes, filesystems, and mountpoints for the next step.
|
||||
|
||||
* `install()`: Performs any install steps required; note that the boundary between `prepare()` and `install()` is fuzzy; the main point is that they are delineated in the sequence as discrete steps.
|
||||
|
||||
* `cleanup()`: Performs any "inner" cleanup of things done in the `prepare()` or `install()` steps (e.g. unmounting and unmapping RBD volumes, removing temporary files, etc.); also called on any *failure* of those steps.
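Putting these together, a minimal skeleton looks like the following. This is a sketch only, showing the documented structure rather than a working install; consult the shipped examples for complete implementations:

```python
#!/usr/bin/env python3
# Minimal VMBuilderScript skeleton; see the shipped examples for real installs

from pvcapid.vmbuilder import VMBuilder


class VMBuilderScript(VMBuilder):
    def setup(self):
        # Validate the environment (raise pvcapid.vmbuilder.ProvisioningError to abort)
        pass

    def create(self):
        # Build and return the Libvirt XML definition as a string
        return f"<domain type='kvm'><name>{self.vm_name}</name> ... </domain>"

    def prepare(self):
        # Create and prepare RBD volumes, filesystems, and mountpoints
        pass

    def install(self):
        # Perform the actual OS installation steps (e.g. via debootstrap)
        pass

    def cleanup(self):
        # Undo anything done in prepare()/install(); also called if those steps fail
        pass
```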
|
||||
|
||||
Each step is described in more detail in the various examples, and those should be consulted to get a full understanding of how the steps work.
|
||||
|
||||
Note that a script should not provide an `__init__` method: doing so could cause the script to fail, and it should not be necessary.
|
||||
|
||||
As mentioned above, the `VMBuilderScript` instance includes several instance variables inherited from the parent `VMBuilder` definition. These consist of:
|
||||
|
||||
* `self.vm_name`: The name of the VM as provided to `pvc provisioner create`.
|
||||
|
||||
* `self.vm_id`: The numeral at the end of the `vm_name` (e.g. 2 for `web2`), or `0` if no numeral is present. Mostly useful when combined with network MAC address templates or preseeding clustered hosts.
|
||||
|
||||
* `self.vm_uuid`: An automatically generated, random universally-unique ID (UUID) for the VM to use in its Libvirt XML definition (or elsewhere, if required).
|
||||
|
||||
* `self.vm_profile`: The name of the PVC provisioner profile used to create the VM. Mostly useful for VM descriptions.
|
||||
|
||||
* `self.vm_data`: A full dictionary representation of the data provided by the PVC provisioner about the VM. Includes many useful details for crafting the VM configuration and setting up disks and networks. An example, in JSON format, is shown below, followed by a short sketch of walking this data:
|
||||
|
||||
```
|
||||
{
|
||||
"ceph_monitor_list": [
|
||||
"hv1.pvcstorage.tld",
|
||||
"hv2.pvcstorage.tld",
|
||||
"hv3.pvcstorage.tld"
|
||||
],
|
||||
"ceph_monitor_port": "6789",
|
||||
"ceph_monitor_secret": "96721723-8650-4a72-b8f6-a93cd1a20f0c",
|
||||
"mac_template": null,
|
||||
"networks": [
|
||||
{
|
||||
"eth_bridge": "vmbr1001",
|
||||
"id": 72,
|
||||
"network_template": 69,
|
||||
"vni": "1001"
|
||||
},
|
||||
{
|
||||
"eth_bridge": "vmbr101",
|
||||
"id": 73,
|
||||
"network_template": 69,
|
||||
"vni": "101"
|
||||
}
|
||||
],
|
||||
"script": [contents of this file]
|
||||
"script_arguments": {
|
||||
"deb_mirror": "http://ftp.debian.org/debian",
|
||||
"deb_release": "bullseye"
|
||||
},
|
||||
"system_architecture": "x86_64",
|
||||
"system_details": {
|
||||
"id": 78,
|
||||
"migration_method": "live",
|
||||
"name": "small",
|
||||
"node_autostart": false,
|
||||
"node_limit": null,
|
||||
"node_selector": null,
|
||||
"ova": null,
|
||||
"serial": true,
|
||||
"vcpu_count": 2,
|
||||
"vnc": false,
|
||||
"vnc_bind": null,
|
||||
"vram_mb": 2048
|
||||
},
|
||||
"volumes": [
|
||||
{
|
||||
"disk_id": "sda",
|
||||
"disk_size_gb": 4,
|
||||
"filesystem": "ext4",
|
||||
"filesystem_args": "-L=root",
|
||||
"id": 9,
|
||||
"mountpoint": "/",
|
||||
"pool": "vms",
|
||||
"source_volume": null,
|
||||
"storage_template": 67
|
||||
},
|
||||
{
|
||||
"disk_id": "sdb",
|
||||
"disk_size_gb": 4,
|
||||
"filesystem": "ext4",
|
||||
"filesystem_args": "-L=var",
|
||||
"id": 10,
|
||||
"mountpoint": "/var",
|
||||
"pool": "vms",
|
||||
"source_volume": null,
|
||||
"storage_template": 67
|
||||
},
|
||||
{
|
||||
"disk_id": "sdc",
|
||||
"disk_size_gb": 4,
|
||||
"filesystem": "ext4",
|
||||
"filesystem_args": "-L=log",
|
||||
"id": 11,
|
||||
"mountpoint": "/var/log",
|
||||
"pool": "vms",
|
||||
"source_volume": null,
|
||||
"storage_template": 67
|
||||
}
|
||||
]
|
||||
}
|
||||
```
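As a rough illustration, a script might walk this structure in `prepare()` to derive the provisioner volume names (following the `<vmname>_<id>` convention described under Disk Templates) and their target mountpoints. This is a sketch only, omitting the actual RBD and filesystem handling shown in the examples:

```python
# Sketch: inside a VMBuilderScript method such as prepare()
for volume in self.vm_data["volumes"]:
    volume_name = f"{self.vm_name}_{volume['disk_id']}"    # e.g. "test1_sda"
    if volume["source_volume"] is not None:
        continue  # cloned volumes are copied from their source, not formatted
    if volume["filesystem"] is not None:
        # mount under the conventional chroot target for the install step
        target = f"/tmp/target{volume['mountpoint']}"
        # ... create the RBD volume, mkfs with volume["filesystem_args"],
        # and mount it at `target` here ...
```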
|
||||
|
||||
Since the `VMBuilderScript` runs in its own context but within the PVC Provisioner/API system, it is possible to use many helper libraries from the PVC system itself, including both the built-in daemon libraries (used by the API itself) and several explicit provisioning script helpers. The following imports, commonly used in the examples, can be leveraged:
|
||||
|
||||
* `pvcapid.vmbuilder.VMBuilder`: Required, provides the parent class for the `VMBuilderScript` class.
|
||||
* `pvcapid.vmbuilder.ProvisioningError`: An exception that should be used within the `VMBuilderScript` to raise exceptions (though you can of course raise any other exception you wish or define your own).
|
||||
* `pvcapid.vmbuilder.open_zk`: A context manager that can be used to open a Zookeeper connection, providing a `zkhandler` that can be passed to other PVC daemon library functions below.
|
||||
* `pvcapid.vmbuilder.chroot`: A context manager that can be used to easily `chroot` into a given directory.
|
||||
* `pvcapid.Daemon.config`: A configuration variable that *must* be passed to `open_zk` if it is used.
|
||||
* `pvcapid.libvirt_schema`: A library providing a number of helpful Libvirt XML snippets that can be used to aid in building a working VM config for PVC. See the examples for a full use case.
|
||||
* `daemon_lib.common`: Part of the PVC daemon libraries, provides several common functions, including, most usefully, `run_os_command`, which provides a wrapped, convenient method to call arbitrary shell/OS commands, returning the POSIX return code, stdout, and stderr (a tuple of the three, in that order).
|
||||
* `daemon_lib.ceph`: Part of the PVC daemon libraries, provides several commands for managing Ceph RBD volumes, including, but not limited to, `clone_volume`, `add_volume`, `map_volume`, and `unmap_volume`. See the `debootstrap` example for a detailed usage example.
|
||||
|
||||
For safety reasons, the script runs in a modified chroot environment on the hypervisor. It has full, but read-only, access to the entire `/` (root partition) of the hypervisor. In addition, it has read-write access to `/dev`, `/sys`, `/run`, and a fresh `/tmp` to write to; by convention, use `/tmp/target` as the destination for any mounting of volumes and installation. Thus it is not possible to, for example, `apt-get install` additional programs within a script; any such requirements must be set up before running the script (e.g. via `pvc-ansible`).
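A rough usage sketch of several of these helpers follows. The exact calling conventions here (in particular the import form of `config`, and how `open_zk` and `chroot` are invoked) are assumptions based on the descriptions above, so consult the shipped examples before relying on them:

```python
# Sketch: inside a VMBuilderScript method such as install()
from pvcapid.Daemon import config
from pvcapid.vmbuilder import open_zk, chroot, ProvisioningError
from daemon_lib.common import run_os_command

# run_os_command returns (returncode, stdout, stderr)
retcode, stdout, stderr = run_os_command("debootstrap --version")
if retcode != 0:
    raise ProvisioningError(f"debootstrap is not available: {stderr}")

# open_zk needs the daemon config and yields a zkhandler for daemon_lib functions
with open_zk(config) as zkhandler:
    pass  # e.g. pass zkhandler to daemon_lib.ceph volume functions

# chroot into the conventional install target to run commands inside the guest
with chroot("/tmp/target"):
    run_os_command("systemctl enable ssh")
```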
|
||||
|
||||
**WARNING**: Of course, despite this "safety" mechanism, it is VERY IMPORTANT to be cognizant that this script runs AS ROOT ON THE HYPERVISOR SYSTEM with FULL ACCESS to the cluster. You should NEVER allow arbitrary, untrusted users the ability to add or modify provisioning scripts. It is trivially easy to write scripts which will do destructive things - for example writing to arbitrary /dev objects, running arbitrary root-level commands, or importing PVC library functions to delete VMs, RBD volumes, or pools. Thus, ensure you vet and understand every script on the system, audit them regularly for both intentional and accidental malicious activity, and of course (to reiterate), do not allow untrusted script creation!
|
||||
|
||||
## Profiles
|
||||
|
||||
Provisioner profiles combine the templates, userdata, and scripts together into dynamic configurations which are then applied to the VM when provisioned. The VM retains the record of this profile name in its configuration for the full lifetime of the VM on the cluster; this is primarily used for cloud-init functionality, but may also serve as a convenient administrator reference.
|
||||
|
||||
Additional arguments to the installation script can be specified along with the profile, to allow further customization of the installation if required.
|
||||
|
||||
#### Examples
|
||||
|
||||
```
|
||||
$ pvc provisioner profile list
|
||||
Using cluster "local" - Host: "10.0.0.1:7370" Scheme: "http" Prefix: "/api/v1"
|
||||
|
||||
Name ID Templates: System Network Storage Data: Userdata Script Script Arguments
|
||||
std-large 41 ext-lg-ser ext-101 standard-ext4 basic-ssh debootstrap deb_release=buster
|
||||
```
|
||||
|
||||
# Deploying VMs from provisioner scripts
|
||||
|
||||
Once a profile with a Script value is defined, creating VMs with the provisioner is as simple as specifying a VM name and a profile to use.
|
||||
|
||||
```
|
||||
$ pvc provisioner create test1 std-large
|
||||
Using cluster "local" - Host: "10.0.0.1:7370" Scheme: "http" Prefix: "/api/v1"
|
||||
|
||||
Task ID: af1d0682-53e8-4141-982f-f672e2f23261
|
||||
```
|
||||
|
||||
This will create a worker job on the current primary node, and status can be queried by providing the job ID.
|
||||
|
||||
```
|
||||
$ pvc provisioner status af1d0682-53e8-4141-982f-f672e2f23261
|
||||
Using cluster "local" - Host: "10.0.0.1:7370" Scheme: "http" Prefix: "/api/v1"
|
||||
|
||||
Job state: RUNNING
|
||||
Stage: 4/10
|
||||
Status: Running script setup() step
|
||||
```
|
||||
|
||||
A list of all running and queued jobs can be obtained by requesting the provisioner status without an ID.
|
||||
|
||||
```
|
||||
$ pvc provisioner status
|
||||
Using cluster "local" - Host: "10.0.0.1:7370" Scheme: "http" Prefix: "/api/v1"
|
||||
|
||||
Job ID Status Worker VM: Name Profile Define? Start?
|
||||
af1d0682-53e8-4141-982f-f672e2f23261 active celery@pvchv1 test1 std-large True True
|
||||
94abb7fe-41f5-42be-b984-de92854f4b3f pending celery@pvchv1 test2 std-large True True
|
||||
43d57a2d-8d0d-42f6-90df-cc39956825a9 pending celery@pvchv1 testX std-large False False
|
||||
```
|
||||
|
||||
The `--wait` option can be given to the create command. This will cause the command to block and provide a visual progress indicator while the provisioning occurs.
|
||||
|
||||
```
|
||||
$ pvc provisioner create --wait test2 std-large
|
||||
Using cluster "local" - Host: "10.0.0.1:7370" Scheme: "http" Prefix: "/api/v1"
|
||||
|
||||
Task ID: 94abb7fe-41f5-42be-b984-de92854f4b3f
|
||||
|
||||
Waiting for task to start..... done.
|
||||
|
||||
[####################################] 100% Starting VM
|
||||
|
||||
SUCCESS: VM "test2" with profile "std-large" has been provisioned and started successfully
|
||||
```
|
||||
|
||||
The administrator can also specify whether or not to automatically define and start the VM when launching a provisioner job, using the `--define`/`--no-define` and `--start`/`--no-start` options. The default is to define and start a VM. `--no-define` implies `--no-start` as there would be no VM to start. Using `--no-start` can be useful if other tasks must be performed before starting the VM for the first time, and `--no-define` can be useful for creating "template" VMs which would then be cloned by other profiles.
|
||||
|
||||
```
|
||||
$ pvc provisioner create test3 std-large --no-define
|
||||
Using cluster "local" - Host: "10.0.0.1:7370" Scheme: "http" Prefix: "/api/v1"
|
||||
|
||||
Task ID: 43d57a2d-8d0d-42f6-90df-cc39956825a9
|
||||
```
|
||||
|
||||
Finally, the administrator may specify additional, one-time script arguments at install time to further tune the VM installation (e.g. setting an FQDN, or a conditional to trigger additional steps in the script); the sketch after the example below shows how a script might consume these arguments.
|
||||
|
||||
```
|
||||
$ pvc provisioner create test4 std-large --script-arg vm_fqdn=testhost.example.tld --script-arg my_foo=True
|
||||
Using cluster "local" - Host: "10.0.0.1:7370" Scheme: "http" Prefix: "/api/v1"
|
||||
|
||||
Task ID: 39639f8c-4866-49de-8c51-4179edec0194
|
||||
```
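Within the script, these arguments appear as string values in `self.vm_data["script_arguments"]`, as shown in the example data earlier. A short sketch of consuming the (purely illustrative) keys used above:

```python
# Sketch: inside a VMBuilderScript method; key names match the example command above
script_args = self.vm_data.get("script_arguments", {})

# fallback FQDN here is illustrative only
vm_fqdn = script_args.get("vm_fqdn", f"{self.vm_name}.local")
if script_args.get("my_foo", "False") == "True":
    # my_foo was passed as "True": trigger some optional, conditional step
    pass
```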
|
||||
|
||||
**NOTE**: A VM that is set to be defined will be defined on the cluster early in the provisioning process, before creating disks or executing the provisioning script, with the special status `provision`. Once provisioning is completed, if the VM is not set to start automatically, the state will remain `provision`, with the VM not running, until its state is explicitly changed with the client (or via autostart when its node returns to `ready` state).
|
||||
|
||||
**NOTE**: Provisioning jobs are tied to the node that spawned them. If the primary node changes, provisioning jobs will continue to run against that node until they are completed, interrupted, or fail, but the active API (now on the new primary node) will not have access to any status data from these jobs, until the primary node status is returned to the original host. The CLI will warn the administrator of this if there are active jobs while running `node primary` or `node secondary` commands.
|
||||
|
||||
**NOTE**: Provisioning jobs cannot be cancelled, either before they start or during execution. The administrator should always let an invalid job either complete or fail out automatically, then remove the erroneous VM with the `vm remove` command.
|
||||
|
||||
# Deploying VMs from OVA images
|
||||
|
||||
PVC supports deploying virtual machines from industry-standard OVA images. OVA images can be uploaded to the cluster with the `pvc provisioner ova` commands, and deployed via the created profile(s) using the `pvc provisioner create` command detailed above for scripted installs; the process is the same in both cases. Additionally, the profile(s) can be modified to suit your specific needs after creation.
|
||||
|
||||
## Uploading an OVA
|
||||
|
||||
Once the OVA is uploaded to the cluster with the `pvc provisioner ova upload` command, it will be visible in two different places:
|
||||
|
||||
* In `pvc provisioner ova list`, one can see all uploaded OVA images as well as details on their disk configurations.
|
||||
|
||||
* In `pvc provisioner profile list`, a new profile will be visible which matches the OVA `NAME` from the upload. This profile will have a "Source" of `OVA <NAME>`, and a system template of the same name. This system template will contain the basic configuration of the VM. You may notice that the other templates and data are set to `N/A`. For full details on this, see the next section.
|
||||
|
||||
## The OVA Provisioner Script
|
||||
|
||||
OVA installs leverage a special provisioner script to handle the VM creation, just as with any other provisioner profile type. This (example) script, or a replacement, must be installed prior to uploading an OVA, and handles the actual VM configuration creation and cloning of the OVA volumes.
|
||||
|
||||
## OVA limitations
|
||||
|
||||
PVC does not implement a *complete* OVA framework. While all basic elements of the OVA are included, the following areas require special attention.
|
||||
|
||||
### Networks
|
||||
|
||||
Because the PVC provisioner has its own conception of networks separate from the OVA profiles, the administrator must perform this mapping themselves, by first creating a network template, and the required networks on the PVC cluster, and then modifying the profile of the resulting OVA.
|
||||
|
||||
The provisioner profile for the OVA can be safely modified to include this new network template at any time, and the resulting VM will be provisioned with these networks.
|
||||
|
||||
This setup was chosen specifically to eliminate corner cases. Most OVA images include a single, "default" network interface, and expect the administrator of the hypervisor to modify this later. You can of course do this, but since PVC has its own conception of networks already in the provisioner, it makes more sense to ignore what the OVA specifies, and allow the administrator full control over this portion of the VM config, before deployment. It is thus always important to be aware of the network requirements of your OVA images, especially if they require specific network configurations, and then create a network template to match.
|
||||
|
||||
### Storage
|
||||
|
||||
During import, PVC splits the OVA into its constituent parts, including any disk images (usually VMDK-formatted). It will then create a separate PVC storage volume for each disk image. These storage volumes are then converted at deployment time from the VMDK format to the PVC native raw format based on their included size in the OVA. Once the storage volume for an actual VM deployment is created, it can then be resized as needed.
|
||||
|
||||
Because of this, OVA profiles do not include storage templates like other PVC profiles. A storage template can still be added to such a profile, and any block devices it defines will be attached after the main OVA block devices. However, this is generally not recommended; it is far better to modify the OVA to add additional volume(s) before uploading it instead.
|
||||
|
||||
**WARNING**: Never adjust the sizes of the OVA VMDK-formatted storage volumes (named `ova_<NAME>_sdX`) or remove them without removing the OVA itself in the provisioner; doing so will prevent the deployment of the OVA, specifically the conversion of the images to raw format at deploy time, and render the OVA profile useless.
|
@ -1,6 +0,0 @@
|
||||
site_name: Parallel Virtual Cluster documentation
|
||||
theme: readthedocs
|
||||
markdown_extensions:
|
||||
- toc:
|
||||
permalink: yes
|
||||
toc_depth: '1-4'
|
@ -2,23 +2,34 @@
|
||||
|
||||
This directory contains several monitoring resources that can be used with various monitoring systems to track and alert on a PVC cluster system.
|
||||
|
||||
### Munin
|
||||
## Munin
|
||||
|
||||
The included munin plugin can be activated by linking to it from `/etc/munin/plugins/pvc`. By default, this plugin triggers a CRITICAL state when either the PVC or Storage cluster becomes Degraded, and is otherwise OK. The overall health is graphed numerically (Optimal is 0, Maintenance is 1, Degraded is 2) so that the cluster health can be tracked over time.
|
||||
The included Munin plugins can be activated by linking to them from `/etc/munin/plugins/`. Two plugins are provided:
|
||||
|
||||
When using this plugin, it might be useful to adjust the thresholds with a plugin configuration. For instance, one could adjust the Degraded value from CRITICAL to WARNING by adjusting the critical threshold to a value higher than 1.99 (e.g. 3, 10, etc.) so that only the WARNING threshold will be hit. Alternatively one could instead make Maintenance mode trigger a WARNING by lowering the threshold to 0.99.
|
||||
* `pvc`: Checks the PVC cluster and node health, as well as their status (OK/Warning/Critical, based on maintenance status), providing 4 graphs.
|
||||
|
||||
Example plugin configuration:
|
||||
* `ceph_utilization`: Checks the Ceph cluster statistics, providing multiple graphs. Note that this plugin is independent of PVC itself, and makes local calls to various Ceph commands itself.
|
||||
|
||||
```
|
||||
[pvc]
|
||||
# Make cluster warn on maintenance
|
||||
env.pvc_cluster_warning 0.99
|
||||
# Disable critical threshold (>2)
|
||||
env.pvc_cluster_critical 3
|
||||
# Make storage warn on maintenance, crit on degraded (latter is default)
|
||||
env.pvc_storage_warning 0.99
|
||||
env.pvc_storage_critical 1.99
|
||||
```
|
||||
The `pvc` plugin provides no configuration; the status is hardcoded such that <=90% health is warning, <=50% health is critical, and maintenance state forces OK. The alerting is provided by two separate graphs, distinct from the health graphs, so that the actual health state is logged regardless of alerting.
|
||||
|
||||
### Check_MK
|
||||
The `ceph_utilization` plugin provides no configuration; only the cluster utilization graph alerts such that >80% used is warning and >90% used is critical. Ceph itself begins warning above 80% as well.
|
||||
|
||||
## CheckMK
|
||||
|
||||
The included CheckMK plugin is divided into two parts: the agent plugin and the monitoring server plugin. The monitoring server plugin requires CheckMK version 2.0 or higher. The two parts can be installed as follows:
|
||||
|
||||
* `pvc`: Place this file in the `/usr/lib/check_mk_agent/plugins/` directory on each node.
|
||||
|
||||
* `pvc.py`: Place this file in the `~/local/lib/python3/cmk/base/plugins/agent_based/` directory on the CheckMK monitoring host for each monitoring site.
|
||||
|
||||
The plugin provides no configuration: the status is hardcoded such that <=90% health is warning, <=50% health is critical, and maintenance state forces OK.
|
||||
|
||||
With both the agent and server plugins installed, you can then run `cmk -II <node>` (or use WATO) to inventory each node, which should produce two new checks:
|
||||
|
||||
* `PVC Cluster`: Provides the cluster-wide health. Note that this will be identical for all nodes in the cluster (i.e. if the cluster health drops, all nodes in the cluster will alert this check).
|
||||
|
||||
* `PVC Node <shortname>`: Provides the per-node health.
|
||||
|
||||
The "Summary" text, shown in the check lists, will be simplistic, only showing the current health percentage.
|
||||
|
||||
The "Details" text, found in the specific check details, will show the full list of problem(s) the check finds, as shown by `pvc status` itself.
|
||||
|
6
node-daemon/monitoring/checkmk/pvc
Executable file
@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
# PVC cluster status check for Check_MK (agent-side)
|
||||
|
||||
echo "<<<pvc>>>"
|
||||
pvc --quiet status --format json
|
95
node-daemon/monitoring/checkmk/pvc.py
Normal file
@ -0,0 +1,95 @@
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# Check_MK PVC plugin
|
||||
#
|
||||
# Copyright 2017-2021, Joshua Boniface <joshua@boniface.me>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from .agent_based_api.v1 import *
|
||||
from cmk.base.check_api import host_name
|
||||
from time import time
|
||||
from json import loads
|
||||
|
||||
|
||||
def discover_pvc(section):
|
||||
my_node = host_name().split(".")[0]
|
||||
yield Service(item=f"PVC Node {my_node}")
|
||||
yield Service(item="PVC Cluster")
|
||||
|
||||
|
||||
def check_pvc(item, params, section):
|
||||
state = State.OK
|
||||
summary = "Stuff"
|
||||
details = None
|
||||
data = loads(" ".join(section[0]))
|
||||
my_node = host_name().split(".")[0]
|
||||
|
||||
maintenance_map = {
|
||||
"true": "on",
|
||||
"false": "off",
|
||||
}
|
||||
maintenance = maintenance_map[data["maintenance"]]
|
||||
|
||||
# Node check
|
||||
if item == f"PVC Node {my_node}":
|
||||
my_node = host_name().split(".")[0]
|
||||
node_health = data["node_health"][my_node]["health"]
|
||||
node_messages = data["node_health"][my_node]["messages"]
|
||||
|
||||
summary = f"Node health is {node_health}% (maintenance {maintenance})"
|
||||
|
||||
if len(node_messages) > 0:
|
||||
details = ", ".join(node_messages)
|
||||
|
||||
if node_health <= 50 and maintenance == "off":
|
||||
state = State.CRIT
|
||||
elif node_health <= 90 and maintenance == "off":
|
||||
state = State.WARN
|
||||
else:
|
||||
state = State.OK
|
||||
|
||||
yield Metric(name="node-health", value=node_health)
|
||||
|
||||
# Cluster check
|
||||
elif item == "PVC Cluster":
|
||||
cluster_health = data["cluster_health"]["health"]
|
||||
cluster_messages = data["cluster_health"]["messages"]
|
||||
|
||||
summary = f"Cluster health is {cluster_health}% (maintenance {maintenance})"
|
||||
|
||||
if len(cluster_messages) > 0:
|
||||
details = ", ".join(cluster_messages)
|
||||
|
||||
if cluster_health <= 50 and maintenance == "off":
|
||||
state = State.CRIT
|
||||
elif cluster_health <= 90 and maintenance == "off":
|
||||
state = State.WARN
|
||||
else:
|
||||
state = State.OK
|
||||
|
||||
yield Metric(name="cluster-health", value=cluster_health)
|
||||
|
||||
yield Result(state=state, summary=summary, details=details)
|
||||
return
|
||||
|
||||
|
||||
register.check_plugin(
|
||||
name="pvc",
|
||||
service_name="%s",
|
||||
check_ruleset_name="pvc",
|
||||
discovery_function=discover_pvc,
|
||||
check_function=check_pvc,
|
||||
check_default_parameters={},
|
||||
)
|
@ -7,23 +7,6 @@
|
||||
|
||||
pvc - Plugin to monitor a PVC cluster.
|
||||
|
||||
=head1 CONFIGURATION
|
||||
|
||||
Note that due to how Munin thresholds work, these values must always be slightly less than 1 or 2 respectively,
|
||||
or the alerts will never be triggered.
|
||||
|
||||
Defaults (no config required):
|
||||
|
||||
[pvc]
|
||||
env.warning 1.99
|
||||
env.critical 1.99
|
||||
|
||||
Make degraded cluster WARN only (max value is 2, so 3 effectively disables):
|
||||
|
||||
[pvc]
|
||||
env.pvc_cluster_warning 1.99
|
||||
env.pvc_cluster_critical 3
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Joshua Boniface <joshua@boniface.me>
|
||||
@ -45,7 +28,9 @@ GPLv3
|
||||
|
||||
. "$MUNIN_LIBDIR/plugins/plugin.sh"
|
||||
|
||||
warning=1.99
|
||||
is_multigraph
|
||||
|
||||
warning=0.99
|
||||
critical=1.99
|
||||
|
||||
export PVC_CLIENT_DIR="/run/shm/munin-pvc"
|
||||
@ -53,16 +38,7 @@ PVC_CMD="/usr/bin/pvc --quiet --cluster local status --format json-pretty"
|
||||
JQ_CMD="/usr/bin/jq"
|
||||
|
||||
output_usage() {
|
||||
echo "This plugin outputs numerical values based on the health of the PVC cluster."
|
||||
echo
|
||||
echo "There are separate outputs for both the PVC cluster itself as well as the Ceph storage cluster."
|
||||
echo "In normal operation, i.e. when both clusters are in 'Optimal' state, the plugin returns 0 for"
|
||||
echo "each cluster. When the cluster is placed into 'Maintenance' mode,the plugin returns 1 for each"
|
||||
echo "cluster, and goes into WARN state (limit 0.99); this can be adjusted by overriding the WARNING"
|
||||
echo "threshold of the plugin to something other than 0.99 - note that due to Munin's alerting design,"
|
||||
echo "the warning value must always be very slightly below the whole number. When either cluster"
|
||||
echo "element becomes 'Degraded', the plugin returns 2 for the relevant cluster, which is treated as a"
|
||||
echo "critical. Like the WARNING threshold, this can be overridden, and with the same caveat about limit."
|
||||
echo "This plugin outputs information about a PVC cluster and node"
|
||||
exit 0
|
||||
}
|
||||
|
||||
@ -84,72 +60,102 @@ output_autoconf() {
|
||||
}
|
||||
|
||||
output_config() {
|
||||
echo 'graph_title PVC Clusters'
|
||||
echo 'multigraph pvc_cluster_health'
|
||||
echo 'graph_title PVC Cluster Health'
|
||||
echo 'graph_args --base 1000'
|
||||
echo 'graph_vlabel Count'
|
||||
echo 'graph_vlabel Health%'
|
||||
echo 'graph_category pvc'
|
||||
echo 'graph_period second'
|
||||
echo 'graph_info This graph shows the nodes in the PVC cluster.'
|
||||
echo 'graph_info Health of the PVC cluster'
|
||||
|
||||
echo 'pvc_cluster.label Cluster Degradation'
|
||||
echo 'pvc_cluster.type GAUGE'
|
||||
echo 'pvc_cluster.max 2'
|
||||
echo 'pvc_cluster.info Whether the PVC cluster is in a degraded state.'
|
||||
print_warning pvc_cluster
|
||||
print_critical pvc_cluster
|
||||
echo 'pvc_cluster_health.label Cluster Health'
|
||||
echo 'pvc_cluster_health.type GAUGE'
|
||||
echo 'pvc_cluster_health.max 100'
|
||||
echo 'pvc_cluster_health.min 0'
|
||||
echo 'pvc_cluster_health.info Health of the PVC cluster in %'
|
||||
|
||||
echo 'pvc_storage.label Storage Degradation'
|
||||
echo 'pvc_storage.type GAUGE'
|
||||
echo 'pvc_storage.max 2'
|
||||
echo 'pvc_storage.info Whether the storage cluster is in a degraded state.'
|
||||
print_warning pvc_storage
|
||||
print_critical pvc_storage
|
||||
echo 'multigraph pvc_cluster_alert'
|
||||
echo 'graph_title PVC Cluster Alerting'
|
||||
echo 'graph_args --base 1000'
|
||||
echo 'graph_vlabel State'
|
||||
echo 'graph_category pvc'
|
||||
echo 'graph_info Alerting state of the PVC cluster health'
|
||||
|
||||
echo 'pvc_cluster_alert.label Cluster Health State'
|
||||
echo 'pvc_cluster_alert.type GAUGE'
|
||||
echo 'pvc_cluster_alert.max 2'
|
||||
echo 'pvc_cluster_alert.min 0'
|
||||
echo 'pvc_cluster_alert.info Alerting state of the PVC cluster health'
|
||||
print_warning pvc_cluster_alert
|
||||
print_critical pvc_cluster_alert
|
||||
|
||||
echo 'multigraph pvc_node_health'
|
||||
echo 'graph_title PVC Node Health'
|
||||
echo 'graph_args --base 1000'
|
||||
echo 'graph_vlabel Health%'
|
||||
echo 'graph_category pvc'
|
||||
echo 'graph_info Health of the PVC node'
|
||||
|
||||
echo 'pvc_node_health.label Node Health'
|
||||
echo 'pvc_node_health.type GAUGE'
|
||||
echo 'pvc_node_health.max 100'
|
||||
echo 'pvc_node_health.min 0'
|
||||
echo 'pvc_node_health.info Health of the PVC node in %'
|
||||
|
||||
echo 'multigraph pvc_node_alert'
|
||||
echo 'graph_title PVC Node Alerting'
|
||||
echo 'graph_args --base 1000'
|
||||
echo 'graph_vlabel State'
|
||||
echo 'graph_category pvc'
|
||||
echo 'graph_info Alerting state of the PVC node health'
|
||||
|
||||
echo 'pvc_node_alert.label Node Health State'
|
||||
echo 'pvc_node_alert.type GAUGE'
|
||||
echo 'pvc_node_alert.max 2'
|
||||
echo 'pvc_node_alert.min 0'
|
||||
echo 'pvc_node_alert.info Alerting state of the PVC node health'
|
||||
print_warning pvc_node_alert
|
||||
print_critical pvc_node_alert
|
||||
|
||||
exit 0
|
||||
}
|
||||
|
||||
output_values() {
|
||||
PVC_OUTPUT="$( $PVC_CMD )"
|
||||
HOST="$( hostname --short )"
|
||||
|
||||
cluster_health="$( $JQ_CMD '.health' <<<"${PVC_OUTPUT}" | tr -d '"' )"
|
||||
cluster_failed_reason="$( $JQ_CMD -r '.health_msg | @csv' <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
|
||||
case $cluster_health in
|
||||
"Optimal")
|
||||
cluster_value="0"
|
||||
;;
|
||||
"Maintenance")
|
||||
cluster_value="1"
|
||||
;;
|
||||
"Degraded")
|
||||
cluster_value="2"
|
||||
esac
|
||||
is_maintenance="$( $JQ_CMD ".maintenance" <<<"${PVC_OUTPUT}" | tr -d '"' )"
|
||||
|
||||
storage_health="$( $JQ_CMD '.storage_health' <<<"${PVC_OUTPUT}" | tr -d '"' )"
|
||||
storage_failed_reason="$( $JQ_CMD -r '.storage_health_msg | @csv' <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
|
||||
case $storage_health in
|
||||
"Optimal")
|
||||
storage_value="0"
|
||||
;;
|
||||
"Maintenance")
|
||||
storage_value="1"
|
||||
;;
|
||||
"Degraded")
|
||||
storage_value="2"
|
||||
esac
|
||||
cluster_health="$( $JQ_CMD ".cluster_health.health" <<<"${PVC_OUTPUT}" | tr -d '"' )"
|
||||
cluster_health_messages="$( $JQ_CMD -r ".cluster_health.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
|
||||
echo 'multigraph pvc_cluster_health'
|
||||
echo "pvc_cluster_health.value ${cluster_health}"
|
||||
echo "pvc_cluster_health.extinfo ${cluster_health_messages}"
|
||||
|
||||
if [[ ${cluster_health} -le 50 && ${is_maintenance} == "false" ]]; then
|
||||
cluster_health_alert=2
|
||||
elif [[ ${cluster_health} -le 90 && ${is_maintenance} == "false" ]]; then
|
||||
cluster_health_alert=1
|
||||
else
|
||||
cluster_health_alert=0
|
||||
fi
|
||||
echo 'multigraph pvc_cluster_alert'
|
||||
echo "pvc_cluster_alert.value ${cluster_health_alert}"
|
||||
|
||||
echo "pvc_cluster.value $cluster_value"
|
||||
if [[ $cluster_value -eq 1 ]]; then
|
||||
echo "pvc_cluster.extinfo Cluster in maintenance mode"
|
||||
elif [[ $cluster_value -eq 2 ]]; then
|
||||
echo "pvc_cluster.extinfo ${cluster_failed_reason}"
|
||||
fi
|
||||
echo "pvc_storage.value $storage_value"
|
||||
if [[ $storage_value -eq 1 ]]; then
|
||||
echo "pvc_storage.extinfo Cluster in maintenance mode"
|
||||
elif [[ $storage_value -eq 2 ]]; then
|
||||
echo "pvc_storage.extinfo ${storage_failed_reason}"
|
||||
fi
|
||||
node_health="$( $JQ_CMD ".node_health.${HOST}.health" <<<"${PVC_OUTPUT}" | tr -d '"' )"
|
||||
node_health_messages="$( $JQ_CMD -r ".node_health.${HOST}.messages | @csv" <<<"${PVC_OUTPUT}" | tr -d '"' | sed 's/,/, /g' )"
|
||||
echo 'multigraph pvc_node_health'
|
||||
echo "pvc_node_health.value ${node_health}"
|
||||
echo "pvc_node_health.extinfo ${node_health_messages}"
|
||||
|
||||
if [[ ${node_health} -le 50 && ${is_maintenance} != "true" ]]; then
|
||||
node_health_alert=2
|
||||
elif [[ ${node_health} -le 90 && ${is_maintenance} != "true" ]]; then
|
||||
node_health_alert=1
|
||||
else
|
||||
node_health_alert=0
|
||||
fi
|
||||
echo 'multigraph pvc_node_alert'
|
||||
echo "pvc_node_alert.value ${node_health_alert}"
|
||||
}
|
||||
|
||||
case $# in
|
||||
|
169
node-daemon/plugins/disk
Normal file
@ -0,0 +1,169 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# disk.py - PVC Monitoring example plugin for disk (system + OSD)
|
||||
# Part of the Parallel Virtual Cluster (PVC) system
|
||||
#
|
||||
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, version 3.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
# This script provides an example of a PVC monitoring plugin script. It will create
|
||||
# a simple plugin to check the system and OSD disks for errors and faults and return
|
||||
# a health delta corresponding to severity.
|
||||
|
||||
# This script can thus be used as an example or reference implementation of a
|
||||
# PVC monitoring plugin script and expanded upon as required.
|
||||
|
||||
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
|
||||
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
|
||||
# of the role of each function is provided in context of the example; see the other
|
||||
# examples for more potential uses.
|
||||
|
||||
# WARNING:
|
||||
#
|
||||
# This script will run in the context of the node daemon keepalives as root.
|
||||
# DO NOT install untrusted, unvetted plugins under any circumstances.
|
||||
|
||||
|
||||
# This import is always required here, as MonitoringPlugin is used by the
|
||||
# MonitoringPluginScript class
|
||||
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
|
||||
|
||||
|
||||
# A monitoring plugin script must always expose its nice name, which must be identical to
|
||||
# the file name
|
||||
PLUGIN_NAME = "disk"
|
||||
|
||||
|
||||
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
|
||||
class MonitoringPluginScript(MonitoringPlugin):
|
||||
def setup(self):
|
||||
"""
|
||||
setup(): Perform special setup steps during node daemon startup
|
||||
|
||||
This step is optional and should be used sparingly.
|
||||
|
||||
If you wish for the plugin to not load in certain conditions, do any checks here
|
||||
and return a non-None failure message to indicate the error.
|
||||
"""
|
||||
|
||||
from daemon_lib.common import run_os_command
|
||||
from json import loads
|
||||
|
||||
_, _all_disks, _ = run_os_command("lsblk --json --paths --include 8,259")
|
||||
try:
|
||||
all_disks = loads(_all_disks)
|
||||
except Exception as e:
|
||||
return f"Error loading lsblk JSON: {e}"
|
||||
|
||||
disk_details = list()
|
||||
|
||||
def get_smartinfo(disk, extra_opt=""):
|
||||
_, _smart_info, _ = run_os_command(f"smartctl --info --json {extra_opt} {disk}")
|
||||
try:
|
||||
smart_info = loads(_smart_info)
|
||||
except Exception as e:
|
||||
return None
|
||||
|
||||
return smart_info
|
||||
|
||||
for disk in [disk["name"] for disk in all_disks['blockdevices']]:
|
||||
extra_opt = ""
|
||||
smart_info = get_smartinfo(disk)
|
||||
if smart_info is None or smart_info["smartctl"]["exit_status"] > 1:
|
||||
continue
|
||||
elif smart_info["smartctl"]["exit_status"] == 1:
|
||||
if "requires option" in smart_info["smartctl"]["messages"][0]["string"]:
|
||||
extra_opt = smart_info["smartctl"]["messages"][0]["string"].split("'")[1].replace('N','0')
|
||||
smart_info = get_smartinfo(disk, extra_opt)
|
||||
if smart_info is None or smart_info["smartctl"]["exit_status"] > 0:
|
||||
continue
|
||||
else:
|
||||
continue
|
||||
|
||||
disk_type = smart_info["device"]["type"]
|
||||
|
||||
disk_details.append((disk, extra_opt, disk_type))
|
||||
|
||||
self.disk_details = disk_details
|
||||
|
||||
|
||||
def run(self, coordinator_state=None):
|
||||
"""
|
||||
run(): Perform the check actions and return a PluginResult object
|
||||
|
||||
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
|
||||
"""
|
||||
|
||||
# Re-run setup each time to ensure the disk details are current
|
||||
self.setup()
|
||||
|
||||
# Run any imports first
|
||||
from daemon_lib.common import run_os_command
|
||||
from json import loads
|
||||
|
||||
health_delta = 0
|
||||
messages = list()
|
||||
|
||||
for _disk in self.disk_details:
|
||||
disk = _disk[0]
|
||||
extra_opt = _disk[1]
|
||||
disk_type = _disk[2]
|
||||
|
||||
_, _smart_info, _ = run_os_command(f"smartctl --all --json {extra_opt} {disk}")
|
||||
try:
|
||||
smart_info = loads(_smart_info)
|
||||
except Exception as e:
|
||||
health_delta += 10
|
||||
messages.append(f"{disk} failed to load SMART data")
|
||||
continue
|
||||
|
||||
if disk_type == 'nvme':
|
||||
for attribute in smart_info.get('nvme_smart_health_information_log', {}).items():
|
||||
if attribute[0] == "critical_warning" and attribute[1] > 0:
|
||||
health_delta += 10
|
||||
messages.append(f"{disk} critical warning value {attribute[1]}")
|
||||
if attribute[0] == "media_errors" and attribute[1] > 0:
|
||||
health_delta += 10
|
||||
messages.append(f"{disk} media errors value {attribute[1]}")
|
||||
if attribute[0] == "percentage_used" and attribute[1] > 90:
|
||||
health_delta += 10
|
||||
messages.append(f"{disk} percentage used value {attribute[1]}%")
|
||||
else:
|
||||
for attribute in smart_info.get('ata_smart_attributes', {}).get('table', []):
|
||||
if attribute["when_failed"]:
|
||||
health_delta += 10
|
||||
messages.append(f"{disk} attribute {attribute['name']} value {attribute['raw']['value']}")
|
||||
|
||||
if len(messages) < 1:
|
||||
messages.append(f"All {len(self.disk_details)} checked disks report OK: {', '.join([disk[0] for disk in self.disk_details])}")
|
||||
|
||||
# Set the health delta in our local PluginResult object
|
||||
self.plugin_result.set_health_delta(health_delta)
|
||||
|
||||
# Set the message in our local PluginResult object
|
||||
self.plugin_result.set_message(', '.join(messages))
|
||||
|
||||
# Return our local PluginResult object
|
||||
return self.plugin_result
|
||||
|
||||
def cleanup(self):
|
||||
"""
|
||||
cleanup(): Perform special cleanup steps during node daemon termination
|
||||
|
||||
This step is optional and should be used sparingly.
|
||||
"""
|
||||
|
||||
pass
|
162
node-daemon/plugins/dpkg
Normal file
@ -0,0 +1,162 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# dpkg.py - PVC Monitoring example plugin for dpkg status
|
||||
# Part of the Parallel Virtual Cluster (PVC) system
|
||||
#
|
||||
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, version 3.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
# This script provides an example of a PVC monitoring plugin script. It will create
|
||||
# a simple plugin to check the system dpkg status is as expected, with no invalid
|
||||
# packages or obsolete configuration files, and will return a 1 health delta for each
|
||||
# flaw in invalid packages, upgradable packages, and obsolete config files.
|
||||
|
||||
# This script can thus be used as an example or reference implementation of a
|
||||
# PVC monitoring plugin script and expanded upon as required.
|
||||
|
||||
# A monitoring plugin script must implement the class "MonitoringPluginScript" which
|
||||
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
|
||||
# of the role of each function is provided in context of the example; see the other
|
||||
# examples for more potential uses.
|
||||
|
||||
# WARNING:
|
||||
#
|
||||
# This script will run in the context of the node daemon keepalives as root.
|
||||
# DO NOT install untrusted, unvetted plugins under any circumstances.
|
||||
|
||||
|
||||
# This import is always required here, as MonitoringPlugin is used by the
|
||||
# MonitoringPluginScript class
|
||||
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin
|
||||
|
||||
|
||||
# A monitoring plugin script must always expose its nice name, which must be identical to
|
||||
# the file name
|
||||
PLUGIN_NAME = "dpkg"
|
||||
|
||||
|
||||
# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
|
||||
class MonitoringPluginScript(MonitoringPlugin):
|
||||
def setup(self):
|
||||
"""
|
||||
setup(): Perform special setup steps during node daemon startup
|
||||
|
||||
This step is optional and should be used sparingly.
|
||||
|
||||
If you wish for the plugin to not load in certain conditions, do any checks here
|
||||
and return a non-None failure message to indicate the error.
|
||||
"""
|
||||
|
||||
pass
|
||||
|
||||
def run(self, coordinator_state=None):
|
||||
"""
|
||||
run(): Perform the check actions and return a PluginResult object
|
||||
|
||||
The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
|
||||
"""
|
||||
|
||||
# Run any imports first
|
||||
from re import match
|
||||
import daemon_lib.common as pvc_common
|
||||
|
||||
# Get Debian version
|
||||
with open('/etc/debian_version', 'r') as fh:
|
||||
debian_version = fh.read().strip()
|
||||
|
||||
# Get a list of dpkg packages for analysis
|
||||
retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/dpkg --list")
|
||||
|
||||
# Get a list of installed packages and states
|
||||
packages = list()
|
||||
for dpkg_line in stdout.split('\n'):
|
||||
if match('^[a-z][a-z] ', dpkg_line):
|
||||
line_split = dpkg_line.split()
|
||||
package_state = line_split[0]
|
||||
package_name = line_split[1]
|
||||
packages.append((package_name, package_state))
|
||||
|
||||
count_ok = 0
|
||||
count_inconsistent = 0
|
||||
list_inconsistent = list()
|
||||
|
||||
for package in packages:
|
||||
if package[1] == "ii":
|
||||
count_ok += 1
|
||||
else:
|
||||
count_inconsistent += 1
|
||||
list_inconsistent.append(package[0])
|
||||
|
||||
# Get upgradable packages
|
||||
retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/apt list --upgradable")
|
||||
|
||||
list_upgradable = list()
|
||||
for apt_line in stdout.split('\n'):
|
||||
if match('^[a-z][a-z] ', apt_line):
|
||||
line_split = apt_line.split('/')
|
||||
package_name = line_split[0]
|
||||
list_upgradable.append(package_name)
|
||||
|
||||
count_upgradable = len(list_upgradable)
|
||||
|
||||
# Get obsolete config files (dpkg-*, ucf-*, or update-* under /etc)
|
||||
retcode, stdout, stderr = pvc_common.run_os_command("/usr/bin/find /etc -type f -a \( -name '*.dpkg-*' -o -name '*.ucf-*' -o -name '*.update-*' \)")
|
||||
|
||||
obsolete_conffiles = list()
|
||||
for conffile_line in stdout.split('\n'):
|
||||
if conffile_line:
|
||||
obsolete_conffiles.append(conffile_line)
|
||||
|
||||
count_obsolete_conffiles = len(obsolete_conffiles)
|
||||
|
||||
# Set health_delta based on the results
|
||||
health_delta = 0
|
||||
if count_inconsistent > 0:
|
||||
health_delta += 1
|
||||
if count_upgradable > 0:
|
||||
health_delta += 1
|
||||
if count_obsolete_conffiles > 0:
|
||||
health_delta += 1
|
||||
|
||||
# Set the health delta in our local PluginResult object
|
||||
self.plugin_result.set_health_delta(health_delta)
|
||||
|
||||
# Craft the message
|
||||
message = f"Debian {debian_version}; Obsolete conffiles: {count_obsolete_conffiles}; Packages inconsistent: {count_inconsistent}, upgradable: {count_upgradable}"
|
||||
|
||||
# Set the message in our local PluginResult object
|
||||
self.plugin_result.set_message(message)
|
||||
|
||||
# Set the detailed data in our local PluginResult object
|
||||
detailed_data = {
|
||||
"debian_version": debian_version,
|
||||
"obsolete_conffiles": obsolete_conffiles,
|
||||
"inconsistent_packages": list_inconsistent,
|
||||
"upgradable_packages": list_upgradable,
|
||||
}
|
||||
self.plugin_result.set_data(detailed_data)
|
||||
|
||||
# Return our local PluginResult object
|
||||
return self.plugin_result
|
||||
|
||||
def cleanup(self):
|
||||
"""
|
||||
cleanup(): Perform special cleanup steps during node daemon termination
|
||||
|
||||
This step is optional and should be used sparingly.
|
||||
"""
|
||||
|
||||
pass
|
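For reference, the two-character state field that the `^[a-z][a-z] ` pattern above keys on comes from `dpkg --list` output; the sketch below uses made-up package lines to show how the parser derives its (name, state) tuples.

# Illustrative sketch only: sample_output mimics typical `dpkg --list` lines;
# real output on any given node will differ.
from re import match

sample_output = "ii  bash     5.1-2+deb11u1  amd64  GNU Bourne Again SHell\nrc  old-pkg  1.0-1          amd64  removed, config files remain"

packages = []
for dpkg_line in sample_output.split('\n'):
    if match('^[a-z][a-z] ', dpkg_line):
        line_split = dpkg_line.split()
        packages.append((line_split[1], line_split[0]))

# packages == [('bash', 'ii'), ('old-pkg', 'rc')]; any state other than "ii" counts as inconsistent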
108
node-daemon/plugins/edac
Normal file
@ -0,0 +1,108 @@
#!/usr/bin/env python3

# edac.py - PVC Monitoring example plugin for EDAC
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system's EDAC registers and report any failures.

# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "edac"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
        """

        # Run any imports first
        import daemon_lib.common as common
        from re import match, search

        # Get edac-util output
        retcode, stdout, stderr = common.run_os_command('/usr/bin/edac-util')

        # If there are no errors, we're OK
        if match(r'^edac-util: No errors to report.', stdout):
            health_delta = 0
            message = "EDAC reports no errors"
        else:
            health_delta = 0
            message = "EDAC reports errors: "
            errors = list()
            for line in stdout.split('\n'):
                if match(r'^mc[0-9]: csrow', line):
                    if 'Uncorrected' in line:
                        health_delta = 50
                    errors.append(' '.join(line.split()[2:]))
            message += ', '.join(errors)

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
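As a rough guide to what the patterns above target: `edac-util` prints "edac-util: No errors to report." when the counters are clean, and per-memory-controller lines otherwise. The sketch below assumes error lines of the form "mcN: csrowN: <label>: N Corrected|Uncorrected Errors"; the values are illustrative, not captured from a real host.

# Illustrative error-path sketch under the assumed edac-util line format
sample_output = "mc0: csrow2: ch0: 3 Corrected Errors\nmc0: csrow2: ch1: 1 Uncorrected Errors"

health_delta = 0
errors = []
for line in sample_output.split('\n'):
    if line.startswith('mc') and 'csrow' in line:
        if 'Uncorrected' in line:
            health_delta = 50
        errors.append(' '.join(line.split()[2:]))
# errors == ['ch0: 3 Corrected Errors', 'ch1: 1 Uncorrected Errors'], health_delta == 50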
247
node-daemon/plugins/hwrd
Normal file
@ -0,0 +1,247 @@
#!/usr/bin/env python3

# hwrd.py - PVC Monitoring example plugin for hardware RAID Arrays
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2023 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check any hardware RAID virtual disks for health and report errors.
# Supports Dell BOSS cards, LSI/Avago/Broadcom MegaRAID, and HP SmartArray RAID.

# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "hwrd"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def check_dellboss(self):
        # Run any imports first
        from daemon_lib.common import run_os_command
        from re import match

        health_delta = 0
        messages = list()

        _dellboss_ret, _dellboss_list, _ = run_os_command("mvcli info -o vd")
        if _dellboss_ret != 0:
            health_delta = 50
            messages.append("Error running MVCLI command")
        else:
            arrays = list()
            idx = None

            for line in _dellboss_list.split('\n'):
                if match(r"^id:", line):
                    idx = int(line.split(":")[-1].strip())
                    arrays.append(dict())
                if match(r"^name:", line):
                    arrays[idx]["name"] = line.split(":")[-1].strip()
                if match(r"^status:", line):
                    arrays[idx]["status"] = line.split(":")[-1].strip()

            for idx, array in enumerate(arrays):
                if array["status"] != "functional":
                    health_delta += 50
                    messages.append(f"RAID Dell BOSS ID {idx} (Name: {array['name']}, State: {array['status']})")

        if len(messages) < 1:
            messages.append("No valid RAID arrays found")

        return health_delta, messages

    def check_megaraid(self):
        # Run any imports first
        from daemon_lib.common import run_os_command
        from re import match

        health_delta = 0
        messages = list()

        _megaraid_ret, _megaraid_list, _ = run_os_command("megacli -LDInfo -Lall -aALL")
        if _megaraid_ret != 0:
            health_delta = 50
            messages.append("Error running MegaCLI command")
        else:
            vd_list = _megaraid_list.split('\n\n\n')
            for idx, _vd in enumerate(vd_list):
                vd = _vd.split('\n')
                if "Virtual Drive Information" not in vd[2]:
                    continue

                raid_name = None
                raid_count = 0
                raid_state = None

                for entry in vd:
                    if len(entry.split(':')) < 2:
                        continue

                    entry_key = entry.split(':')[0].strip()
                    entry_value = entry.split(':')[1].strip()

                    if entry_key == "State":
                        raid_state = entry_value
                    if entry_key == "Name":
                        raid_name = entry_value
                    if entry_key == "Number Of Drives":
                        raid_count = entry_value

                if raid_state is None or raid_name is None or raid_count == 0:
                    health_delta += 10
                    messages.append(f"RAID ID {idx} did not report useful values")
                    continue

                if raid_state != "Optimal":
                    health_delta += 50
                    messages.append(f"RAID MegaRAID ID {idx} (Name: {raid_name}, Disks: {raid_count}, State: {raid_state})")

        if len(messages) < 1:
            messages.append("No valid RAID arrays found")

        return health_delta, messages

    def check_hpsa(self):
        # Run any imports first
        from daemon_lib.common import run_os_command
        from re import match, findall

        health_delta = 0
        messages = list()

        _hparray_ret, _hparray_list, _ = run_os_command(f"ssacli ctrl slot={self.controller_slot} ld all show")
        if _hparray_ret != 0:
            health_delta = 50
            messages.append("Error running SSACLI command")
        else:
            vd_lines = _hparray_list.split('\n\n')

            arrays = list()
            cur_array = None
            for idx, _line in enumerate(vd_lines):
                line = _line.strip()
                if match(r"^Array", line):
                    cur_array = line
                if match(r"^logicaldrive", line) and cur_array is not None:
                    arrays.append(f"{cur_array} {line}")

            for array in arrays:
                if "OK" not in array:
                    health_delta += 50
                    messages.append(f"RAID HPSA {array}")

        if len(messages) < 1:
            messages.append("No valid RAID arrays found")

        return health_delta, messages

    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        from daemon_lib.common import run_os_command
        from re import match, findall

        self.raid_type = list()

        _dellboss_ret, _dellboss_list, _ = run_os_command("mvcli info -o vd")
        if _dellboss_ret == 0:
            # If this returns 0 at all, there's a valid BOSS card to manage
            self.raid_type.append("dellboss")

        _megaraid_ret, _megaraid_list, _ = run_os_command("megacli -LDInfo -Lall -aALL")
        if _megaraid_ret == 0:
            vd_list = _megaraid_list.split('\n\n\n')
            for idx, _vd in enumerate(vd_list):
                vd = _vd.split('\n')
                if "Virtual Drive Information" in vd[2]:
                    self.raid_type.append("megaraid")

        _hpraid_ret, _hpraid_list, _ = run_os_command("ssacli ctrl all show status")
        if _hpraid_ret == 0:
            for line in _hpraid_list.split('\n'):
                if match(r"^Smart", line):
                    controller_slots = findall("Slot ([0-9])", line)
                    if len(controller_slots) > 0:
                        self.raid_type.append("hpsa")
                        self.controller_slot = controller_slots[0]

        if len(self.raid_type) < 1:
            return "No hardware RAID management commands found"

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object
        """

        health_delta = 0
        messages = list()

        raid_function_map = {
            "megaraid": self.check_megaraid,
            "hpsa": self.check_hpsa,
            "dellboss": self.check_dellboss,
        }

        for raid_type in self.raid_type:
            _health_delta, _messages = raid_function_map.get(raid_type)()
            health_delta += _health_delta
            messages += _messages

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(', '.join(messages))

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
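The setup()/run() split above follows a simple pattern: setup() probes for each vendor tool once and records which checkers apply, and run() dispatches through a name-to-method map and sums the results. A minimal standalone sketch of that dispatch idea, using hypothetical checker functions rather than the real vendor commands, looks like this:

# Minimal sketch of the dispatch pattern; check_a/check_b are hypothetical
# stand-ins for check_megaraid/check_hpsa/check_dellboss.
def check_a():
    return 0, ["A OK"]

def check_b():
    return 10, ["B degraded"]

function_map = {"a": check_a, "b": check_b}
detected = ["a", "b"]  # what setup() would have recorded

health_delta, messages = 0, []
for kind in detected:
    delta, msgs = function_map[kind]()
    health_delta += delta
    messages += msgs
# health_delta == 10, messages == ['A OK', 'B degraded']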
109
node-daemon/plugins/ipmi
Normal file
@ -0,0 +1,109 @@
#!/usr/bin/env python3

# ipmi.py - PVC Monitoring example plugin for IPMI
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check whether the system IPMI is reachable.

# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "ipmi"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
        """

        # Run any imports first
        from daemon_lib.common import run_os_command

        # Check the node's IPMI interface
        ipmi_hostname = self.config["ipmi_hostname"]
        ipmi_username = self.config["ipmi_username"]
        ipmi_password = self.config["ipmi_password"]
        retcode, _, _ = run_os_command(
            f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_username} -P {ipmi_password} chassis power status",
            timeout=2
        )

        if retcode > 0:
            # Set the health delta to 10 (subtract 10 from the total of 100)
            health_delta = 10
            # Craft a message that can be used by the clients
            message = f"IPMI via {ipmi_username}@{ipmi_hostname} is NOT responding"
        else:
            # Set the health delta to 0 (no change)
            health_delta = 0
            # Craft a message that can be used by the clients
            message = f"IPMI via {ipmi_username}@{ipmi_hostname} is responding"

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
107
node-daemon/plugins/lbvt
Normal file
@ -0,0 +1,107 @@
#!/usr/bin/env python3

# lbvt.py - PVC Monitoring example plugin for Libvirtd
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the Libvirt daemon instance on the node for operation.

# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "lbvt"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
        """

        # Run any imports first
        from libvirt import openReadOnly as lvopen

        lv_conn = None

        # Set the health delta to 0 (no change)
        health_delta = 0
        # Craft a message that can be used by the clients
        message = "Successfully connected to Libvirtd on localhost"

        # Check the Libvirt connection
        try:
            lv_conn = lvopen(f"qemu+tcp://{self.this_node.name}/system")
            data = lv_conn.getHostname()
        except Exception as e:
            health_delta = 50
            message = f"Failed to connect to Libvirtd: {e}"
        finally:
            if lv_conn is not None:
                lv_conn.close()

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
109
node-daemon/plugins/load
Normal file
@ -0,0 +1,109 @@
#!/usr/bin/env python3

# load.py - PVC Monitoring example plugin for load
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the system load against the total number of CPU cores.

# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "load"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
        """

        # Run any imports first
        from os import getloadavg
        from psutil import cpu_count

        # Get the current 1-minute system load average
        load_average = float(round(getloadavg()[0], 2))

        # Get the number of CPU cores
        cpu_cores = cpu_count()

        # Check whether the load average is greater than the number of CPU cores
        if load_average > float(cpu_cores):
            # Set the health delta to 50 (subtract 50 from the total of 100)
            health_delta = 50
            # Craft a message that can be used by the clients
            message = f"Current load is {load_average} out of {cpu_cores} CPU cores"

        else:
            # Set the health delta to 0 (no change)
            health_delta = 0
            # Craft a message that can be used by the clients
            message = f"Current load is {load_average} out of {cpu_cores} CPU cores"

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
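A quick worked example of the threshold above, with hypothetical values: on a 16-core node a 1-minute load average of 24.5 trips the check, while 12.0 does not.

# Hypothetical values for illustration only
cpu_cores = 16
for load_average in (12.0, 24.5):
    health_delta = 50 if load_average > float(cpu_cores) else 0
    print(f"load {load_average} on {cpu_cores} cores -> health_delta {health_delta}")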
198
node-daemon/plugins/nics
Normal file
@ -0,0 +1,198 @@
#!/usr/bin/env python3

# nics.py - PVC Monitoring example plugin for NIC interfaces
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the network interfaces of the host, specifically for speed
# and 802.3ad status (if applicable).

# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "nics"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        pass

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
        """

        # Run any imports first
        import daemon_lib.common as common
        from re import match, search, findall

        messages = list()
        health_delta = 0

        # Get a list of the various underlying devices
        _core_nics = set()

        for dev in [
            self.config['bridge_dev'],
            self.config['upstream_dev'],
            self.config['cluster_dev'],
            self.config['storage_dev'],
        ]:
            with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent:
                _devtype = uevent.readlines()[0].split('=')[-1].strip()

            if _devtype == 'vlan':
                with open(f"/proc/net/vlan/{dev}") as devfh:
                    vlan_info = devfh.read().split('\n')
                for line in vlan_info:
                    if match(r'^Device:', line):
                        dev = line.split()[-1]

            _core_nics.add(dev)

        core_nics = sorted(list(_core_nics))

        for dev in core_nics:
            with open(f'/sys/class/net/{dev}/uevent', 'r') as uevent:
                _devtype = uevent.readlines()[0].split('=')[-1].strip()

            if _devtype == "bond":
                syspath = f"/proc/net/bonding/{dev}"

                with open(syspath) as devfh:
                    bonding_stats = devfh.read()

                _, _mode, _info, *_slaves = bonding_stats.split('\n\n')

                slave_interfaces = list()
                for slavedev in _slaves:
                    lines = slavedev.split('\n')
                    for line in lines:
                        if match(r'^Slave Interface:', line):
                            interface_name = line.split()[-1]
                        if match(r'^MII Status:', line):
                            interface_status = line.split()[-1]
                        if match(r'^Speed:', line):
                            try:
                                interface_speed_mbps = int(line.split()[-2])
                            except Exception:
                                interface_speed_mbps = 0
                        if match(r'^Duplex:', line):
                            interface_duplex = line.split()[-1]
                    slave_interfaces.append((interface_name, interface_status, interface_speed_mbps, interface_duplex))

                # Ensure at least 2 slave interfaces are up
                slave_interface_up_count = 0
                for slave_interface in slave_interfaces:
                    if slave_interface[1] == 'up':
                        slave_interface_up_count += 1
                if slave_interface_up_count < 2:
                    messages.append(f"{dev} DEGRADED with {slave_interface_up_count} active slaves")
                    health_delta += 10
                else:
                    messages.append(f"{dev} OK with {slave_interface_up_count} active slaves")

                # Get ethtool supported speeds for slave interfaces
                supported_link_speeds = set()
                for slave_interface in slave_interfaces:
                    slave_dev = slave_interface[0]
                    _, ethtool_stdout, _ = common.run_os_command(f"ethtool {slave_dev}")
                    in_modes = False
                    for line in ethtool_stdout.split('\n'):
                        if search('Supported link modes:', line):
                            in_modes = True
                        if search('Supported pause frame use:', line):
                            in_modes = False
                            break
                        if in_modes:
                            speed = int(findall(r'\d+', line.split()[-1])[0])
                            supported_link_speeds.add(speed)
            else:
                # Get ethtool supported speeds for interface
                supported_link_speeds = set()
                _, ethtool_stdout, _ = common.run_os_command(f"ethtool {dev}")
                in_modes = False
                for line in ethtool_stdout.split('\n'):
                    if search('Supported link modes:', line):
                        in_modes = True
                    if search('Supported pause frame use:', line):
                        in_modes = False
                        break
                    if in_modes:
                        speed = int(line.split()[-1].replace('baseT', '').split('/')[0])
                        supported_link_speeds.add(speed)

            max_supported_link_speed = sorted(list(supported_link_speeds))[-1]

            # Ensure interface is running at its maximum speed
            with open(f"/sys/class/net/{dev}/speed") as devfh:
                dev_speed = int(devfh.read())
            if dev_speed < max_supported_link_speed:
                messages.append(f"{dev} DEGRADED at {dev_speed} Mbps")
                health_delta += 10
            else:
                messages.append(f"{dev} OK at {dev_speed} Mbps")

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(', '.join(messages))

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
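The bond branch above relies on the blank-line-separated layout of /proc/net/bonding/<dev>; the sketch below parses an illustrative two-slave excerpt (not captured from a real host, and with most fields omitted) the same way the plugin does.

# Illustrative /proc/net/bonding excerpt; real files carry many more fields.
bonding_stats = (
    "Ethernet Channel Bonding Driver: v5.10\n\n"
    "Bonding Mode: IEEE 802.3ad Dynamic link aggregation\nMII Status: up\n\n"
    "802.3ad info\nLACP rate: fast\n\n"
    "Slave Interface: eno1\nMII Status: up\nSpeed: 10000 Mbps\nDuplex: full\n\n"
    "Slave Interface: eno2\nMII Status: down\nSpeed: 10000 Mbps\nDuplex: full"
)
_, _mode, _info, *_slaves = bonding_stats.split('\n\n')
up_slaves = sum(1 for s in _slaves if 'MII Status: up' in s)
# up_slaves == 1 here, so the plugin would flag the bond as DEGRADED and add 10 to health_delta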
141
node-daemon/plugins/psql
Normal file
@ -0,0 +1,141 @@
#!/usr/bin/env python3

# psql.py - PVC Monitoring example plugin for Postgres/Patroni
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check the Patroni PostgreSQL instance on the node for operation.

# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "psql"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.
        """

        pass

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
        """

        # Run any imports first
        from psycopg2 import connect

        conn_metadata = None
        cur_metadata = None
        conn_pdns = None
        cur_pdns = None

        # Set the health delta to 0 (no change)
        health_delta = 0
        # Craft a message that can be used by the clients
        message = "Successfully connected to PostgreSQL databases on localhost"

        # Check the Metadata database (primary)
        try:
            conn_metadata = connect(
                host=self.this_node.name,
                port=self.config["metadata_postgresql_port"],
                dbname=self.config["metadata_postgresql_dbname"],
                user=self.config["metadata_postgresql_user"],
                password=self.config["metadata_postgresql_password"],
            )
            cur_metadata = conn_metadata.cursor()
            cur_metadata.execute("""SELECT * FROM alembic_version""")
            data = cur_metadata.fetchone()
        except Exception as e:
            health_delta = 50
            err = str(e).split('\n')[0]
            message = f"Failed to connect to PostgreSQL database {self.config['metadata_postgresql_dbname']}: {err}"
        finally:
            if cur_metadata is not None:
                cur_metadata.close()
            if conn_metadata is not None:
                conn_metadata.close()

        if health_delta == 0:
            # Check the PowerDNS database (secondary)
            try:
                conn_pdns = connect(
                    host=self.this_node.name,
                    port=self.config["pdns_postgresql_port"],
                    dbname=self.config["pdns_postgresql_dbname"],
                    user=self.config["pdns_postgresql_user"],
                    password=self.config["pdns_postgresql_password"],
                )
                cur_pdns = conn_pdns.cursor()
                cur_pdns.execute("""SELECT * FROM supermasters""")
                data = cur_pdns.fetchone()
            except Exception as e:
                health_delta = 50
                err = str(e).split('\n')[0]
                message = f"Failed to connect to PostgreSQL database {self.config['pdns_postgresql_dbname']}: {err}"
            finally:
                if cur_pdns is not None:
                    cur_pdns.close()
                if conn_pdns is not None:
                    conn_pdns.close()

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(message)

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
138
node-daemon/plugins/psur
Normal file
@ -0,0 +1,138 @@
#!/usr/bin/env python3

# psur.py - PVC Monitoring example plugin for PSU Redundancy
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

# This script provides an example of a PVC monitoring plugin script. It will create
# a simple plugin to check IPMI for power supply redundancy status.

# This script can thus be used as an example or reference implementation of a
# PVC monitoring plugin script and expanded upon as required.

# A monitoring plugin script must implement the class "MonitoringPluginScript" which
# extends "MonitoringPlugin", providing the 3 functions indicated. Detailed explanation
# of the role of each function is provided in context of the example; see the other
# examples for more potential uses.

# WARNING:
#
# This script will run in the context of the node daemon keepalives as root.
# DO NOT install untrusted, unvetted plugins under any circumstances.


# This import is always required here, as MonitoringPlugin is used by the
# MonitoringPluginScript class
from pvcnoded.objects.MonitoringInstance import MonitoringPlugin


# A monitoring plugin script must always expose its nice name, which must be identical to
# the file name
PLUGIN_NAME = "psur"


# The MonitoringPluginScript class must be named as such, and extend MonitoringPlugin.
class MonitoringPluginScript(MonitoringPlugin):
    def setup(self):
        """
        setup(): Perform special setup steps during node daemon startup

        This step is optional and should be used sparingly.

        If you wish for the plugin to not load in certain conditions, do any checks here
        and return a non-None failure message to indicate the error.
        """

        # Run any imports first
        from daemon_lib.common import run_os_command
        from re import match

        _ipmitool_ret, _ipmitool_list, _ = run_os_command("ipmitool sdr type 'Power Supply'")
        if _ipmitool_ret != 0:
            return "Error running ipmitool command"
        else:
            search_values = [
                "PS Redundancy",   # Dell PowerEdge
                "Power Supplies",  # HP ProLiant
                "PS_RDNDNT_MODE",  # Cisco UCS
            ]
            reading_lines = [l for l in _ipmitool_list.split('\n') if len(l.split('|')) > 0 and l.split('|')[0].strip() in search_values]
            if len(reading_lines) < 1:
                return "No valid input power sensors found"

    def run(self, coordinator_state=None):
        """
        run(): Perform the check actions and return a PluginResult object

        The {coordinator_state} can be used to check if this is a "primary" coordinator, "secondary" coordinator, or "client" (non-coordinator)
        """

        # Run any imports first
        from daemon_lib.common import run_os_command
        from re import match

        health_delta = 0
        messages = list()

        _ipmitool_ret, _ipmitool_list, _ = run_os_command("ipmitool sdr type 'Power Supply'")
        if _ipmitool_ret != 0 or len(_ipmitool_list.split('\n')) < 1:
            health_delta = 0
            messages.append("Error running ipmitool command")
        else:
            search_values = [
                "PS Redundancy",   # Dell PowerEdge
                "Power Supplies",  # HP ProLiant
                "PS_RDNDNT_MODE",  # Cisco UCS
            ]

            reading_lines = [l for l in _ipmitool_list.split('\n') if len(l.split('|')) > 0 and l.split('|')[0].strip() in search_values]
            if len(reading_lines) > 0:
                for reading_line in reading_lines:
                    reading_sensor = reading_line.split('|')[1].strip()
                    reading_text = reading_line.split('|')[-1].strip()

                    if reading_text == "Fully Redundant":
                        health_delta += 0
                        messages.append(f"Input power sensor {reading_sensor} reports {reading_text}")
                    elif reading_text == "No Reading":
                        health_delta += 5
                        messages.append(f"Input power sensor {reading_sensor} reports {reading_text} (PSU redundancy not configured?)")
                    else:
                        health_delta += 10
                        messages.append(f"Input power sensor {reading_sensor} reports {reading_text}")
            else:
                health_delta = 5
                messages.append("No valid input power sensors found despite being detected during setup")

        # Set the health delta in our local PluginResult object
        self.plugin_result.set_health_delta(health_delta)

        # Set the message in our local PluginResult object
        self.plugin_result.set_message(', '.join(messages))

        # Return our local PluginResult object
        return self.plugin_result

    def cleanup(self):
        """
        cleanup(): Perform special cleanup steps during node daemon termination

        This step is optional and should be used sparingly.
        """

        pass
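The sensor lines that setup() and run() above filter on are pipe-delimited `ipmitool sdr` rows; the sample below is an illustrative Dell-style reading, not captured from a real host, showing which fields the plugin extracts.

# Illustrative pipe-delimited row; real `ipmitool sdr type 'Power Supply'` output varies by vendor.
reading_line = "PS Redundancy    | 0Bh | ok  |  7.1 | Fully Redundant"
reading_sensor = reading_line.split('|')[1].strip()   # '0Bh'
reading_text = reading_line.split('|')[-1].strip()    # 'Fully Redundant'
# 'Fully Redundant' adds 0 to health_delta; 'No Reading' adds 5; anything else adds 10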