Add live migrate max downtime selector meta field

Adds a new field to the VM metadata that sets the maximum downtime
allowed during a live migration. This allows the value to be adjusted
for very busy VMs whose live migrations would otherwise hang.
This commit is contained in:
2024-01-10 16:13:31 -05:00
parent 38eeb78423
commit 09269f182c
17 changed files with 283 additions and 30 deletions

View File

@ -441,12 +441,14 @@ def getDomainMetadata(zkhandler, dom_uuid):
domain_node_selector,
domain_node_autostart,
domain_migration_method,
domain_migration_max_downtime,
) = zkhandler.read_many(
[
("domain.meta.node_limit", dom_uuid),
("domain.meta.node_selector", dom_uuid),
("domain.meta.autostart", dom_uuid),
("domain.meta.migrate_method", dom_uuid),
("domain.meta.migrate_max_downtime", dom_uuid),
]
)
@ -464,11 +466,15 @@ def getDomainMetadata(zkhandler, dom_uuid):
if not domain_migration_method or domain_migration_method == "none":
domain_migration_method = None
if not domain_migration_max_downtime or domain_migration_max_downtime == "none":
domain_migration_max_downtime = 300
return (
domain_node_limit,
domain_node_selector,
domain_node_autostart,
domain_migration_method,
domain_migration_max_downtime,
)
@ -505,6 +511,7 @@ def getInformationFromXML(zkhandler, uuid):
domain_node_selector,
domain_node_autostart,
domain_migration_method,
domain_migration_max_downtime,
) = getDomainMetadata(zkhandler, uuid)
domain_tags = getDomainTags(zkhandler, uuid)
@ -565,6 +572,7 @@ def getInformationFromXML(zkhandler, uuid):
"node_selector": domain_node_selector,
"node_autostart": bool(strtobool(domain_node_autostart)),
"migration_method": domain_migration_method,
"migration_max_downtime": int(domain_migration_max_downtime),
"tags": domain_tags,
"description": domain_description,
"profile": domain_profile,

View File

@ -0,0 +1 @@
{"version": "13", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "logs": "/logs", "faults": "/faults", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "faults": {"id": "", "last_time": "/last_time", "first_time": "/first_time", "ack_time": "/ack_time", "status": "/status", "delta": "/delta", "message": "/message"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health", "network.stats": "/network_stats"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", 
"data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.migrate_max_downtime": "/migration_max_downtime", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": 
"/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "is_split": "/is_split", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}

View File

@ -147,6 +147,7 @@ def define_vm(
node_selector,
node_autostart,
migration_method=None,
migration_max_downtime=300,
profile=None,
tags=[],
initial_state="stop",
@ -272,6 +273,10 @@ def define_vm(
(("domain.console.vnc", dom_uuid), ""),
(("domain.meta.autostart", dom_uuid), node_autostart),
(("domain.meta.migrate_method", dom_uuid), str(migration_method).lower()),
(
("domain.meta.migrate_max_downtime", dom_uuid),
int(migration_max_downtime),
),
(("domain.meta.node_limit", dom_uuid), formatted_node_limit),
(("domain.meta.node_selector", dom_uuid), str(node_selector).lower()),
(("domain.meta.tags", dom_uuid), ""),
@ -305,6 +310,7 @@ def modify_vm_metadata(
node_autostart,
provisioner_profile,
migration_method,
migration_max_downtime,
):
dom_uuid = getDomainUUID(zkhandler, domain)
if not dom_uuid:
@ -331,6 +337,14 @@ def modify_vm_metadata(
(("domain.meta.migrate_method", dom_uuid), str(migration_method).lower())
)
if migration_max_downtime is not None:
update_list.append(
(
("domain.meta.migrate_max_downtime", dom_uuid),
int(migration_max_downtime),
)
)
if len(update_list) < 1:
return False, "ERROR: No updates to apply."
@ -563,6 +577,7 @@ def rename_vm(zkhandler, domain, new_domain):
dom_info["node_selector"],
dom_info["node_autostart"],
migration_method=dom_info["migration_method"],
migration_max_downtime=dom_info["migration_max_downtime"],
profile=dom_info["profile"],
tags=dom_info["tags"],
initial_state="stop",
@ -1624,6 +1639,7 @@ def restore_vm(zkhandler, domain, backup_path, datestring, retain_snapshot=False
backup_source_details["vm_detail"]["node_selector"],
backup_source_details["vm_detail"]["node_autostart"],
backup_source_details["vm_detail"]["migration_method"],
backup_source_details["vm_detail"]["migration_max_downtime"],
backup_source_details["vm_detail"]["profile"],
backup_source_details["vm_detail"]["tags"],
"restore",

View File

@ -744,6 +744,7 @@ def worker_create_vm(
node_selector = vm_data["system_details"]["node_selector"]
node_autostart = vm_data["system_details"]["node_autostart"]
migration_method = vm_data["system_details"]["migration_method"]
migration_max_downtime = vm_data["system_details"]["migration_max_downtime"]
with open_zk(config) as zkhandler:
retcode, retmsg = pvc_vm.define_vm(
zkhandler,
@ -753,6 +754,7 @@ def worker_create_vm(
node_selector,
node_autostart,
migration_method,
migration_max_downtime,
vm_profile,
initial_state="provision",
)

View File

@ -572,7 +572,7 @@ class ZKHandler(object):
#
class ZKSchema(object):
# Current version
_version = 12
_version = 13
# Root for doing nested keys
_schema_root = ""
@ -707,6 +707,7 @@ class ZKSchema(object):
"console.vnc": "/vnc",
"meta.autostart": "/node_autostart",
"meta.migrate_method": "/migration_method",
"meta.migrate_max_downtime": "/migration_max_downtime",
"meta.node_selector": "/node_selector",
"meta.node_limit": "/node_limit",
"meta.tags": "/tags",
@ -1026,6 +1027,8 @@ class ZKSchema(object):
default_data = "False"
elif elem == "pool" and ikey == "tier":
default_data = "default"
elif elem == "domain" and ikey == "meta.migrate_max_downtime":
default_data = "300"
else:
default_data = ""
zkhandler.zk_conn.create(