Reformat code with Black code formatter
Unify the code style along PEP and Black principles using the tool.
This commit is contained in:
@ -35,74 +35,89 @@ def fence_node(node_name, zkhandler, config, logger):
|
||||
failcount = 0
|
||||
while failcount < failcount_limit:
|
||||
# Wait 5 seconds
|
||||
time.sleep(config['keepalive_interval'])
|
||||
time.sleep(config["keepalive_interval"])
|
||||
# Get the state
|
||||
node_daemon_state = zkhandler.read(('node.state.daemon', node_name))
|
||||
node_daemon_state = zkhandler.read(("node.state.daemon", node_name))
|
||||
# Is it still 'dead'
|
||||
if node_daemon_state == 'dead':
|
||||
if node_daemon_state == "dead":
|
||||
failcount += 1
|
||||
logger.out('Node "{}" failed {}/{} saving throws'.format(node_name, failcount, failcount_limit), state='s')
|
||||
logger.out(
|
||||
'Node "{}" failed {}/{} saving throws'.format(
|
||||
node_name, failcount, failcount_limit
|
||||
),
|
||||
state="s",
|
||||
)
|
||||
# It changed back to something else so it must be alive
|
||||
else:
|
||||
logger.out('Node "{}" passed a saving throw; canceling fence'.format(node_name), state='o')
|
||||
logger.out(
|
||||
'Node "{}" passed a saving throw; canceling fence'.format(node_name),
|
||||
state="o",
|
||||
)
|
||||
return
|
||||
|
||||
logger.out('Fencing node "{}" via IPMI reboot signal'.format(node_name), state='s')
|
||||
logger.out('Fencing node "{}" via IPMI reboot signal'.format(node_name), state="s")
|
||||
|
||||
# Get IPMI information
|
||||
ipmi_hostname = zkhandler.read(('node.ipmi.hostname', node_name))
|
||||
ipmi_username = zkhandler.read(('node.ipmi.username', node_name))
|
||||
ipmi_password = zkhandler.read(('node.ipmi.password', node_name))
|
||||
ipmi_hostname = zkhandler.read(("node.ipmi.hostname", node_name))
|
||||
ipmi_username = zkhandler.read(("node.ipmi.username", node_name))
|
||||
ipmi_password = zkhandler.read(("node.ipmi.password", node_name))
|
||||
|
||||
# Shoot it in the head
|
||||
fence_status = reboot_via_ipmi(ipmi_hostname, ipmi_username, ipmi_password, logger)
|
||||
|
||||
# Hold to ensure the fence takes effect and system stabilizes
|
||||
logger.out('Waiting {}s for fence of node "{}" to take effect'.format(config['keepalive_interval'], node_name), state='i')
|
||||
time.sleep(config['keepalive_interval'])
|
||||
logger.out(
|
||||
'Waiting {}s for fence of node "{}" to take effect'.format(
|
||||
config["keepalive_interval"], node_name
|
||||
),
|
||||
state="i",
|
||||
)
|
||||
time.sleep(config["keepalive_interval"])
|
||||
|
||||
if fence_status:
|
||||
logger.out('Marking node "{}" as fenced'.format(node_name), state='i')
|
||||
logger.out('Marking node "{}" as fenced'.format(node_name), state="i")
|
||||
while True:
|
||||
try:
|
||||
zkhandler.write([
|
||||
(('node.state.daemon', node_name), 'fenced')
|
||||
])
|
||||
zkhandler.write([(("node.state.daemon", node_name), "fenced")])
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# Force into secondary network state if needed
|
||||
if node_name in config['coordinators']:
|
||||
logger.out('Forcing secondary status for node "{}"'.format(node_name), state='i')
|
||||
zkhandler.write([
|
||||
(('node.state.router', node_name), 'secondary')
|
||||
])
|
||||
if zkhandler.read('base.config.primary_node') == node_name:
|
||||
zkhandler.write([
|
||||
('base.config.primary_node', 'none')
|
||||
])
|
||||
if node_name in config["coordinators"]:
|
||||
logger.out(
|
||||
'Forcing secondary status for node "{}"'.format(node_name), state="i"
|
||||
)
|
||||
zkhandler.write([(("node.state.router", node_name), "secondary")])
|
||||
if zkhandler.read("base.config.primary_node") == node_name:
|
||||
zkhandler.write([("base.config.primary_node", "none")])
|
||||
|
||||
# If the fence succeeded and successful_fence is migrate
|
||||
if fence_status and config['successful_fence'] == 'migrate':
|
||||
if fence_status and config["successful_fence"] == "migrate":
|
||||
migrateFromFencedNode(zkhandler, node_name, config, logger)
|
||||
|
||||
# If the fence failed and failed_fence is migrate
|
||||
if not fence_status and config['failed_fence'] == 'migrate' and config['suicide_intervals'] != '0':
|
||||
if (
|
||||
not fence_status
|
||||
and config["failed_fence"] == "migrate"
|
||||
and config["suicide_intervals"] != "0"
|
||||
):
|
||||
migrateFromFencedNode(zkhandler, node_name, config, logger)
|
||||
|
||||
|
||||
# Migrate hosts away from a fenced node
|
||||
def migrateFromFencedNode(zkhandler, node_name, config, logger):
|
||||
logger.out('Migrating VMs from dead node "{}" to new hosts'.format(node_name), state='i')
|
||||
logger.out(
|
||||
'Migrating VMs from dead node "{}" to new hosts'.format(node_name), state="i"
|
||||
)
|
||||
|
||||
# Get the list of VMs
|
||||
dead_node_running_domains = zkhandler.read(('node.running_domains', node_name)).split()
|
||||
dead_node_running_domains = zkhandler.read(
|
||||
("node.running_domains", node_name)
|
||||
).split()
|
||||
|
||||
# Set the node to a custom domainstate so we know what's happening
|
||||
zkhandler.write([
|
||||
(('node.state.domain', node_name), 'fence-flush')
|
||||
])
|
||||
zkhandler.write([(("node.state.domain", node_name), "fence-flush")])
|
||||
|
||||
# Migrate a VM after a flush
|
||||
def fence_migrate_vm(dom_uuid):
|
||||
@ -111,28 +126,40 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
|
||||
target_node = common.findTargetNode(zkhandler, dom_uuid)
|
||||
|
||||
if target_node is not None:
|
||||
logger.out('Migrating VM "{}" to node "{}"'.format(dom_uuid, target_node), state='i')
|
||||
zkhandler.write([
|
||||
(('domain.state', dom_uuid), 'start'),
|
||||
(('domain.node', dom_uuid), target_node),
|
||||
(('domain.last_node', dom_uuid), node_name),
|
||||
])
|
||||
logger.out(
|
||||
'Migrating VM "{}" to node "{}"'.format(dom_uuid, target_node),
|
||||
state="i",
|
||||
)
|
||||
zkhandler.write(
|
||||
[
|
||||
(("domain.state", dom_uuid), "start"),
|
||||
(("domain.node", dom_uuid), target_node),
|
||||
(("domain.last_node", dom_uuid), node_name),
|
||||
]
|
||||
)
|
||||
else:
|
||||
logger.out('No target node found for VM "{}"; VM will autostart on next unflush/ready of current node'.format(dom_uuid), state='i')
|
||||
zkhandler.write({
|
||||
(('domain.state', dom_uuid), 'stopped'),
|
||||
(('domain.meta.autostart', dom_uuid), 'True'),
|
||||
})
|
||||
logger.out(
|
||||
'No target node found for VM "{}"; VM will autostart on next unflush/ready of current node'.format(
|
||||
dom_uuid
|
||||
),
|
||||
state="i",
|
||||
)
|
||||
zkhandler.write(
|
||||
{
|
||||
(("domain.state", dom_uuid), "stopped"),
|
||||
(("domain.meta.autostart", dom_uuid), "True"),
|
||||
}
|
||||
)
|
||||
|
||||
# Loop through the VMs
|
||||
for dom_uuid in dead_node_running_domains:
|
||||
fence_migrate_vm(dom_uuid)
|
||||
|
||||
# Set node in flushed state for easy remigrating when it comes back
|
||||
zkhandler.write([
|
||||
(('node.state.domain', node_name), 'flushed')
|
||||
])
|
||||
logger.out('All VMs flushed from dead node "{}" to new hosts'.format(node_name), state='i')
|
||||
zkhandler.write([(("node.state.domain", node_name), "flushed")])
|
||||
logger.out(
|
||||
'All VMs flushed from dead node "{}" to new hosts'.format(node_name), state="i"
|
||||
)
|
||||
|
||||
|
||||
#
|
||||
@ -140,68 +167,100 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
|
||||
#
|
||||
def reboot_via_ipmi(ipmi_hostname, ipmi_user, ipmi_password, logger):
|
||||
# Power off the node the node
|
||||
logger.out('Sending power off to dead node', state='i')
|
||||
ipmi_command_stop = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power off'.format(
|
||||
ipmi_hostname, ipmi_user, ipmi_password
|
||||
logger.out("Sending power off to dead node", state="i")
|
||||
ipmi_command_stop = (
|
||||
"/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power off".format(
|
||||
ipmi_hostname, ipmi_user, ipmi_password
|
||||
)
|
||||
)
|
||||
ipmi_stop_retcode, ipmi_stop_stdout, ipmi_stop_stderr = common.run_os_command(
|
||||
ipmi_command_stop
|
||||
)
|
||||
ipmi_stop_retcode, ipmi_stop_stdout, ipmi_stop_stderr = common.run_os_command(ipmi_command_stop)
|
||||
|
||||
if ipmi_stop_retcode != 0:
|
||||
logger.out(f'Failed to power off dead node: {ipmi_stop_stderr}', state='e')
|
||||
logger.out(f"Failed to power off dead node: {ipmi_stop_stderr}", state="e")
|
||||
|
||||
time.sleep(5)
|
||||
|
||||
# Check the chassis power state
|
||||
logger.out('Checking power state of dead node', state='i')
|
||||
ipmi_command_status = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status'.format(
|
||||
ipmi_hostname, ipmi_user, ipmi_password
|
||||
logger.out("Checking power state of dead node", state="i")
|
||||
ipmi_command_status = (
|
||||
"/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status".format(
|
||||
ipmi_hostname, ipmi_user, ipmi_password
|
||||
)
|
||||
)
|
||||
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
|
||||
ipmi_command_status
|
||||
)
|
||||
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(ipmi_command_status)
|
||||
if ipmi_status_retcode == 0:
|
||||
logger.out(f'Current chassis power state is: {ipmi_status_stdout.strip()}', state='i')
|
||||
logger.out(
|
||||
f"Current chassis power state is: {ipmi_status_stdout.strip()}", state="i"
|
||||
)
|
||||
else:
|
||||
logger.out(f'Current chassis power state is: Unknown', state='w')
|
||||
logger.out(f"Current chassis power state is: Unknown", state="w")
|
||||
|
||||
# Power on the node
|
||||
logger.out('Sending power on to dead node', state='i')
|
||||
ipmi_command_start = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power on'.format(
|
||||
ipmi_hostname, ipmi_user, ipmi_password
|
||||
logger.out("Sending power on to dead node", state="i")
|
||||
ipmi_command_start = (
|
||||
"/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power on".format(
|
||||
ipmi_hostname, ipmi_user, ipmi_password
|
||||
)
|
||||
)
|
||||
ipmi_start_retcode, ipmi_start_stdout, ipmi_start_stderr = common.run_os_command(
|
||||
ipmi_command_start
|
||||
)
|
||||
ipmi_start_retcode, ipmi_start_stdout, ipmi_start_stderr = common.run_os_command(ipmi_command_start)
|
||||
|
||||
if ipmi_start_retcode != 0:
|
||||
logger.out(f'Failed to power on dead node: {ipmi_start_stderr}', state='w')
|
||||
logger.out(f"Failed to power on dead node: {ipmi_start_stderr}", state="w")
|
||||
|
||||
time.sleep(2)
|
||||
|
||||
# Check the chassis power state
|
||||
logger.out('Checking power state of dead node', state='i')
|
||||
ipmi_command_status = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status'.format(
|
||||
ipmi_hostname, ipmi_user, ipmi_password
|
||||
logger.out("Checking power state of dead node", state="i")
|
||||
ipmi_command_status = (
|
||||
"/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status".format(
|
||||
ipmi_hostname, ipmi_user, ipmi_password
|
||||
)
|
||||
)
|
||||
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
|
||||
ipmi_command_status
|
||||
)
|
||||
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(ipmi_command_status)
|
||||
|
||||
if ipmi_stop_retcode == 0:
|
||||
if ipmi_status_stdout.strip() == "Chassis Power is on":
|
||||
# We successfully rebooted the node and it is powered on; this is a succeessful fence
|
||||
logger.out('Successfully rebooted dead node', state='o')
|
||||
logger.out("Successfully rebooted dead node", state="o")
|
||||
return True
|
||||
elif ipmi_status_stdout.strip() == "Chassis Power is off":
|
||||
# We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence
|
||||
logger.out('Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence-flush', state='o')
|
||||
logger.out(
|
||||
"Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence-flush",
|
||||
state="o",
|
||||
)
|
||||
return True
|
||||
else:
|
||||
# We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence
|
||||
logger.out('Chassis power is in an unknown state ({}) after successful IPMI reboot; not performing fence-flush'.format(ipmi_status_stdout.strip()), state='e')
|
||||
logger.out(
|
||||
"Chassis power is in an unknown state ({}) after successful IPMI reboot; not performing fence-flush".format(
|
||||
ipmi_status_stdout.strip()
|
||||
),
|
||||
state="e",
|
||||
)
|
||||
return False
|
||||
else:
|
||||
if ipmi_status_stdout.strip() == "Chassis Power is off":
|
||||
# We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence
|
||||
logger.out('Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence-flush', state='o')
|
||||
logger.out(
|
||||
"Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence-flush",
|
||||
state="o",
|
||||
)
|
||||
return True
|
||||
else:
|
||||
# We failed to reboot the node but it is in some unknown power state (including "on"); since this might indicate a silent failure, we must call it a failed fence
|
||||
logger.out('Chassis power is not in confirmed off state after failed IPMI reboot; not performing fence-flush', state='e')
|
||||
logger.out(
|
||||
"Chassis power is not in confirmed off state after failed IPMI reboot; not performing fence-flush",
|
||||
state="e",
|
||||
)
|
||||
return False
|
||||
|
||||
|
||||
@ -209,7 +268,7 @@ def reboot_via_ipmi(ipmi_hostname, ipmi_user, ipmi_password, logger):
|
||||
# Verify that IPMI connectivity to this host exists (used during node init)
|
||||
#
|
||||
def verify_ipmi(ipmi_hostname, ipmi_user, ipmi_password):
|
||||
ipmi_command = f'/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status'
|
||||
ipmi_command = f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
|
||||
retcode, stdout, stderr = common.run_os_command(ipmi_command, timeout=2)
|
||||
if retcode == 0 and stdout.strip() == "Chassis Power is on":
|
||||
return True
|
||||
|
Reference in New Issue
Block a user