2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								#!/usr/bin/env python3  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2021-08-21 02:46:11 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								# fencing.py - Utility functions for pvcnoded fencing  
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								# Part of the Parallel Virtual Cluster (PVC) system  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								#  
						 
					
						
							
								
									
										
										
										
											2022-10-06 11:55:27 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>  
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								#  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								#    This program is free software: you can redistribute it and/or modify  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								#    it under the terms of the GNU General Public License as published by  
						 
					
						
							
								
									
										
										
										
											2021-03-25 16:57:17 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								#    the Free Software Foundation, version 3.  
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								#  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								#    This program is distributed in the hope that it will be useful,  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								#    but WITHOUT ANY WARRANTY; without even the implied warranty of  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								#    GNU General Public License for more details.  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								#  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								#    You should have received a copy of the GNU General Public License  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								#    along with this program.  If not, see <https://www.gnu.org/licenses/>.  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								#  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								###############################################################################  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								import  time  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2021-06-01 12:17:25 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								import  daemon_lib . common  as  common  
						 
					
						
							
								
									
										
										
										
											2021-08-21 02:46:11 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								from  pvcnoded . objects . VMInstance  import  VMInstance  
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2020-11-07 14:45:24 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								#  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								# Fence thread entry function  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								#  
						 
					
						
							
								
									
										
										
										
											2021-08-21 02:46:11 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								def  fence_node ( node_name ,  zkhandler ,  config ,  logger ) :  
						 
					
						
							
								
									
										
										
										
											2020-08-05 22:36:28 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    # We allow exactly 6 saving throws (30 seconds) for the host to come back online or we kill it 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    failcount_limit  =  6 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    failcount  =  0 
							 
						 
					
						
							
								
									
										
										
										
											2020-08-05 22:36:28 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    while  failcount  <  failcount_limit : 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								        # Wait 5 seconds 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        time . sleep ( config [ " keepalive_interval " ] ) 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								        # Get the state 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        node_daemon_state  =  zkhandler . read ( ( " node.state.daemon " ,  node_name ) ) 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								        # Is it still 'dead' 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        if  node_daemon_state  ==  " dead " : 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								            failcount  + =  1 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            logger . out ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                ' Node  " {} "  failed  {} / {}  saving throws ' . format ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    node_name ,  failcount ,  failcount_limit 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                ) , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                state = " s " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ) 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								        # It changed back to something else so it must be alive 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        else : 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            logger . out ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                ' Node  " {} "  passed a saving throw; canceling fence ' . format ( node_name ) , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                state = " o " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ) 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								            return 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    logger . out ( ' Fencing node  " {} "  via IPMI reboot signal ' . format ( node_name ) ,  state = " s " ) 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Get IPMI information 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    ipmi_hostname  =  zkhandler . read ( ( " node.ipmi.hostname " ,  node_name ) ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ipmi_username  =  zkhandler . read ( ( " node.ipmi.username " ,  node_name ) ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ipmi_password  =  zkhandler . read ( ( " node.ipmi.password " ,  node_name ) ) 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Shoot it in the head 
							 
						 
					
						
							
								
									
										
										
										
											2021-08-21 02:46:11 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    fence_status  =  reboot_via_ipmi ( ipmi_hostname ,  ipmi_username ,  ipmi_password ,  logger ) 
							 
						 
					
						
							
								
									
										
										
										
											2021-09-26 20:07:30 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2020-08-15 12:38:03 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    # Hold to ensure the fence takes effect and system stabilizes 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    logger . out ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ' Waiting  {} s for fence of node  " {} "  to take effect ' . format ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            config [ " keepalive_interval " ] ,  node_name 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ) , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        state = " i " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    time . sleep ( config [ " keepalive_interval " ] ) 
							 
						 
					
						
							
								
									
										
										
										
											2021-10-27 16:24:17 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2021-09-26 20:07:30 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    if  fence_status : 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        logger . out ( ' Marking node  " {} "  as fenced ' . format ( node_name ) ,  state = " i " ) 
							 
						 
					
						
							
								
									
										
										
										
											2021-10-27 16:24:17 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        while  True : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            try : 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								                zkhandler . write ( [ ( ( " node.state.daemon " ,  node_name ) ,  " fenced " ) ] ) 
							 
						 
					
						
							
								
									
										
										
										
											2021-10-27 16:24:17 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								                break 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            except  Exception : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                continue 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Force into secondary network state if needed 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    if  node_name  in  config [ " coordinators " ] : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        logger . out ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ' Forcing secondary status for node  " {} " ' . format ( node_name ) ,  state = " i " 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        zkhandler . write ( [ ( ( " node.state.router " ,  node_name ) ,  " secondary " ) ] ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        if  zkhandler . read ( " base.config.primary_node " )  ==  node_name : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            zkhandler . write ( [ ( " base.config.primary_node " ,  " none " ) ] ) 
							 
						 
					
						
							
								
									
										
										
										
											2019-06-25 22:31:04 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    # If the fence succeeded and successful_fence is migrate 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    if  fence_status  and  config [ " successful_fence " ]  ==  " migrate " : 
							 
						 
					
						
							
								
									
										
										
										
											2021-06-01 11:53:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        migrateFromFencedNode ( zkhandler ,  node_name ,  config ,  logger ) 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # If the fence failed and failed_fence is migrate 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    if  ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        not  fence_status 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        and  config [ " failed_fence " ]  ==  " migrate " 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        and  config [ " suicide_intervals " ]  !=  " 0 " 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ) : 
							 
						 
					
						
							
								
									
										
										
										
											2021-06-01 11:53:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        migrateFromFencedNode ( zkhandler ,  node_name ,  config ,  logger ) 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2020-11-07 14:45:24 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								# Migrate VMs away from a fenced node
						 
					
						
							
								
									
										
										
										
											2021-06-01 11:53:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								def  migrateFromFencedNode ( zkhandler ,  node_name ,  config ,  logger ) :  
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    logger . out ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ' Migrating VMs from dead node  " {} "  to new hosts ' . format ( node_name ) ,  state = " i " 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ) 
							 
						 
					
						
							
								
									
										
										
										
											2020-08-05 22:26:01 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Get the list of VMs 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    dead_node_running_domains  =  zkhandler . read ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ( " node.running_domains " ,  node_name ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ) . split ( ) 
							 
						 
					
						
							
								
									
										
										
										
											2020-08-05 22:26:01 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Set the node to a custom domainstate so we know what's happening 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    zkhandler . write ( [ ( ( " node.state.domain " ,  node_name ) ,  " fence-flush " ) ] ) 
							 
						 
					
						
							
								
									
										
										
										
											2020-08-05 22:26:01 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Migrate a VM after a flush 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    def  fence_migrate_vm ( dom_uuid ) : 
							 
						 
					
						
							
								
									
										
										
										
											2021-06-01 11:53:21 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        VMInstance . flush_locks ( zkhandler ,  logger ,  dom_uuid ) 
							 
						 
					
						
							
								
									
										
										
										
											2019-07-09 19:17:53 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2021-06-08 23:34:49 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        target_node  =  common . findTargetNode ( zkhandler ,  dom_uuid ) 
							 
						 
					
						
							
								
									
										
										
										
											2018-11-23 20:02:31 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2019-10-12 01:17:39 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        if  target_node  is  not  None : 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            logger . out ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                ' Migrating VM  " {} "  to node  " {} " ' . format ( dom_uuid ,  target_node ) , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                state = " i " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            zkhandler . write ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                [ 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    ( ( " domain.state " ,  dom_uuid ) ,  " start " ) , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    ( ( " domain.node " ,  dom_uuid ) ,  target_node ) , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    ( ( " domain.last_node " ,  dom_uuid ) ,  node_name ) , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                ] 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ) 
							 
						 
					
						
							
								
									
										
										
										
											2019-10-12 01:17:39 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        else : 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            logger . out ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                ' No target node found for VM  " {} " ; VM will autostart on next unflush/ready of current node ' . format ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    dom_uuid 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                ) , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                state = " i " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            zkhandler . write ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                { 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    ( ( " domain.state " ,  dom_uuid ) ,  " stopped " ) , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    ( ( " domain.meta.autostart " ,  dom_uuid ) ,  " True " ) , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                } 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ) 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2020-08-05 22:26:01 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    # Loop through the VMs 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    for  dom_uuid  in  dead_node_running_domains : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        fence_migrate_vm ( dom_uuid ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    # Set node in flushed state for easy remigrating when it comes back 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    zkhandler . write ( [ ( ( " node.state.domain " ,  node_name ) ,  " flushed " ) ] ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    logger . out ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ' All VMs flushed from dead node  " {} "  to new hosts ' . format ( node_name ) ,  state = " i " 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ) 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2020-11-07 14:45:24 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								#  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								# Perform an IPMI fence  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								#  
						 
					
						
							
								
									
										
										
										
											2021-08-21 02:46:11 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								def  reboot_via_ipmi ( ipmi_hostname ,  ipmi_user ,  ipmi_password ,  logger ) :  
						 
					
						
							
								
									
										
										
										
											2021-10-12 10:59:09 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    # Power off the node 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    logger . out ( " Sending power off to dead node " ,  state = " i " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ipmi_command_stop  =  ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        " /usr/bin/ipmitool -I lanplus -H  {}  -U  {}  -P  {}  chassis power off " . format ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ipmi_hostname ,  ipmi_user ,  ipmi_password 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ipmi_stop_retcode ,  ipmi_stop_stdout ,  ipmi_stop_stderr  =  common . run_os_command ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ipmi_command_stop 
							 
						 
					
						
							
								
									
										
										
										
											2018-10-22 20:20:27 -04:00 
										
									 
								 
							 
							
								
							 
							
								 
							
							
								    ) 
							 
						 
					
						
							
								
									
										
										
										
											2018-11-23 20:02:31 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2021-10-12 10:59:09 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    if  ipmi_stop_retcode  !=  0 : 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        logger . out ( f " Failed to power off dead node:  { ipmi_stop_stderr } " ,  state = " e " ) 
							 
						 
					
						
							
								
									
										
											 
										
											
												Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
    a. If the chassis is on, the fence succeeded.
    b. If the chassis is off, the fence "succeeded" as well.
    c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
    a. If the chassis is off, the fence itself failed, but we can treat
    it as "succeeded" since the chassis is in a known-offline state.
    This is the most likely situation when there is a critical hardware
    failure, and the server's IPMI does not allow itself to start back
    up again.
    b. If the chassis is in any other state ("on" or unknown), the fence
    itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
											 
										 
										
											2021-07-13 17:17:14 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2021-10-27 16:24:17 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    time . sleep ( 5 ) 
							 
						 
					
						
							
								
									
										
											 
										
											
												Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
    a. If the chassis is on, the fence succeeded.
    b. If the chassis is off, the fence "succeeded" as well.
    c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
    a. If the chassis is off, the fence itself failed, but we can treat
    it as "succeeded" since the chassis is in a known-offline state.
    This is the most likely situation when there is a critical hardware
    failure, and the server's IPMI does not allow itself to start back
    up again.
    b. If the chassis is in any other state ("on" or unknown), the fence
    itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
											 
										 
										
											2021-07-13 17:17:14 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2021-10-12 10:59:09 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    # Check the chassis power state 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    logger . out ( " Checking power state of dead node " ,  state = " i " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ipmi_command_status  =  ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        " /usr/bin/ipmitool -I lanplus -H  {}  -U  {}  -P  {}  chassis power status " . format ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ipmi_hostname ,  ipmi_user ,  ipmi_password 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ipmi_status_retcode ,  ipmi_status_stdout ,  ipmi_status_stderr  =  common . run_os_command ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ipmi_command_status 
							 
						 
					
						
							
								
									
										
										
										
											2021-10-12 10:59:09 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    if  ipmi_status_retcode  ==  0 : 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        logger . out ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            f " Current chassis power state is:  { ipmi_status_stdout . strip ( ) } " ,  state = " i " 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ) 
							 
						 
					
						
							
								
									
										
										
										
											2021-10-12 10:59:09 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    else : 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:26:03 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        logger . out ( " Current chassis power state is: Unknown " ,  state = " w " ) 
							 
						 
					
						
							
								
									
										
										
										
											2021-10-12 10:59:09 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    # Power on the node 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    logger . out ( " Sending power on to dead node " ,  state = " i " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ipmi_command_start  =  ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        " /usr/bin/ipmitool -I lanplus -H  {}  -U  {}  -P  {}  chassis power on " . format ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ipmi_hostname ,  ipmi_user ,  ipmi_password 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ipmi_start_retcode ,  ipmi_start_stdout ,  ipmi_start_stderr  =  common . run_os_command ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ipmi_command_start 
							 
						 
					
						
							
								
									
										
											 
										
											
												Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
    a. If the chassis is on, the fence succeeded.
    b. If the chassis is off, the fence "succeeded" as well.
    c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
    a. If the chassis is off, the fence itself failed, but we can treat
    it as "succeeded" since the chassis is in a known-offline state.
    This is the most likely situation when there is a critical hardware
    failure, and the server's IPMI does not allow itself to start back
    up again.
    b. If the chassis is in any other state ("on" or unknown), the fence
    itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
											 
										 
										
											2021-07-13 17:17:14 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    ) 
							 
						 
					
						
							
								
									
										
										
										
											2020-12-15 02:45:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2021-10-12 10:59:09 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    if  ipmi_start_retcode  !=  0 : 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        logger . out ( f " Failed to power on dead node:  { ipmi_start_stderr } " ,  state = " w " ) 
							 
						 
					
						
							
								
									
										
										
										
											2021-10-12 10:59:09 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2019-08-07 11:35:49 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    time . sleep ( 2 ) 
							 
						 
					
						
							
								
									
										
										
										
											2018-11-23 20:02:31 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
											 
										
											
												Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
    a. If the chassis is on, the fence succeeded.
    b. If the chassis is off, the fence "succeeded" as well.
    c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
    a. If the chassis is off, the fence itself failed, but we can treat
    it as "succeeded" since the chassis is in a known-offline state.
    This is the most likely situation when there is a critical hardware
    failure, and the server's IPMI does not allow itself to start back
    up again.
    b. If the chassis is in any other state ("on" or unknown), the fence
    itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
											 
										 
										
											2021-07-13 17:17:14 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    # Check the chassis power state 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    logger . out ( " Checking power state of dead node " ,  state = " i " ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ipmi_command_status  =  ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        " /usr/bin/ipmitool -I lanplus -H  {}  -U  {}  -P  {}  chassis power status " . format ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ipmi_hostname ,  ipmi_user ,  ipmi_password 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    ipmi_status_retcode ,  ipmi_status_stdout ,  ipmi_status_stderr  =  common . run_os_command ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        ipmi_command_status 
							 
						 
					
						
							
								
									
										
										
										
											2018-11-23 20:02:31 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2021-10-12 10:59:09 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								    if  ipmi_stop_retcode  ==  0 : 
							 
						 
					
						
							
								
									
										
										
										
											2021-09-26 20:07:30 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        if  ipmi_status_stdout . strip ( )  ==  " Chassis Power is on " : 
							 
						 
					
						
							
								
									
										
											 
										
											
												Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
    a. If the chassis is on, the fence succeeded.
    b. If the chassis is off, the fence "succeeded" as well.
    c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
    a. If the chassis is off, the fence itself failed, but we can treat
    it as "succeeded" since the chassis is in a known-offline state.
    This is the most likely situation when there is a critical hardware
    failure, and the server's IPMI does not allow itself to start back
    up again.
    b. If the chassis is in any other state ("on" or unknown), the fence
    itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
											 
										 
										
											2021-07-13 17:17:14 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            # We successfully rebooted the node and it is powered on; this is a successful fence 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            logger . out ( " Successfully rebooted dead node " ,  state = " o " ) 
							 
						 
					
						
							
								
									
										
											 
										
											
												Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
    a. If the chassis is on, the fence succeeded.
    b. If the chassis is off, the fence "succeeded" as well.
    c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
    a. If the chassis is off, the fence itself failed, but we can treat
    it as "succeeded" since the chassis is in a known-offline state.
    This is the most likely situation when there is a critical hardware
    failure, and the server's IPMI does not allow itself to start back
    up again.
    b. If the chassis is in any other state ("on" or unknown), the fence
    itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
											 
										 
										
											2021-07-13 17:17:14 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            return  True 
							 
						 
					
						
							
								
									
										
										
										
											2021-09-26 20:07:30 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        elif  ipmi_status_stdout . strip ( )  ==  " Chassis Power is off " : 
							 
						 
					
						
							
								
									
										
											 
										
											
												Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
    a. If the chassis is on, the fence succeeded.
    b. If the chassis is off, the fence "succeeded" as well.
    c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
    a. If the chassis is off, the fence itself failed, but we can treat
    it as "succeeded" since the chassis is in a known-offline state.
    This is the most likely situation when there is a critical hardware
    failure, and the server's IPMI does not allow itself to start back
    up again.
    b. If the chassis is in any other state ("on" or unknown), the fence
    itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
											 
										 
										
											2021-07-13 17:17:14 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            # We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            logger . out ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                " Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence-flush " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                state = " o " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ) 
							 
						 
					
						
							
								
									
										
											 
										
											
												Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
    a. If the chassis is on, the fence succeeded.
    b. If the chassis is off, the fence "succeeded" as well.
    c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
    a. If the chassis is off, the fence itself failed, but we can treat
    it as "succeeded" since the chassis is in a known-offline state.
    This is the most likely situation when there is a critical hardware
    failure, and the server's IPMI does not allow itself to start back
    up again.
    b. If the chassis is in any other state ("on" or unknown), the fence
    itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
											 
										 
										
											2021-07-13 17:17:14 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            return  True 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        else : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            # We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            logger . out ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                " Chassis power is in an unknown state ( {} ) after successful IPMI reboot; not performing fence-flush " . format ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                    ipmi_status_stdout . strip ( ) 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                ) , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                state = " e " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ) 
							 
						 
					
						
							
								
									
										
											 
										
											
												Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
    a. If the chassis is on, the fence succeeded.
    b. If the chassis is off, the fence "succeeded" as well.
    c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
    a. If the chassis is off, the fence itself failed, but we can treat
    it as "succeeded" since the chassis is in a known-offline state.
    This is the most likely situation when there is a critical hardware
    failure, and the server's IPMI does not allow itself to start back
    up again.
    b. If the chassis is in any other state ("on" or unknown), the fence
    itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
											 
										 
										
											2021-07-13 17:17:14 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            return  False 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								    else : 
							 
						 
					
						
							
								
									
										
										
										
											2021-09-26 20:07:30 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								        if  ipmi_status_stdout . strip ( )  ==  " Chassis Power is off " : 
							 
						 
					
						
							
								
									
										
											 
										
											
												Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
    a. If the chassis is on, the fence succeeded.
    b. If the chassis is off, the fence "succeeded" as well.
    c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
    a. If the chassis is off, the fence itself failed, but we can treat
    it as "succeeded" since the chassis is in a known-offline state.
    This is the most likely situation when there is a critical hardware
    failure, and the server's IPMI does not allow itself to start back
    up again.
    b. If the chassis is in any other state ("on" or unknown), the fence
    itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
											 
										 
										
											2021-07-13 17:17:14 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            # We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            logger . out ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                " Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence-flush " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                state = " o " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ) 
							 
						 
					
						
							
								
									
										
											 
										
											
												Rework success checks for IPMI fencing
Previously, if the node failed to restart, it was declared a "bad fence"
and no further action would be taken. However, there are some
situations, for instance critical hardware failures, where intelligent
systems will not attempt (or succeed at) starting up the node in such a
case, which would result in dead, known-offline nodes without recovery.
Tweak this behaviour somewhat. The main path of Reboot -> Check On ->
Success + fence-flush is retained, but some additional side-paths are
now defined:
1. We attempt to power "on" the chassis 1 second after the reboot, just
in case it is off and can be recovered. We then wait another 2 seconds
and check the power status (as we did before).
2. If the reboot succeeded, follow this series of choices:
    a. If the chassis is on, the fence succeeded.
    b. If the chassis is off, the fence "succeeded" as well.
    c. If the chassis is in some other state, the fence failed.
3. If the reboot failed, follow this series of choices:
    a. If the chassis is off, the fence itself failed, but we can treat
    it as "succeeded" since the chassis is in a known-offline state.
    This is the most likely situation when there is a critical hardware
    failure, and the server's IPMI does not allow itself to start back
    up again.
    b. If the chassis is in any other state ("on" or unknown), the fence
    itself failed and we must treat this as a fence failure.
Overall, this should alleviate the aforementioned issue of a critical
failure rendering the node persistently "off" not triggering a
fence-flush and ensure fencing is more robust.
											 
										 
										
											2021-07-13 17:17:14 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            return  True 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								        else : 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            # We failed to reboot the node but it is in some unknown power state (including "on"); since this might indicate a silent failure, we must call it a failed fence 
							 
						 
					
						
							
								
									
										
										
										
											2021-11-06 03:02:43 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            logger . out ( 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                " Chassis power is not in confirmed off state after failed IPMI reboot; not performing fence-flush " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								                state = " e " , 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								            ) 
							 
						 
					
						
							
								
									
										
										
										
											2020-12-15 02:45:38 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								            return  False 
							 
						 
					
						
							
								
									
										
										
										
											2020-08-13 14:38:05 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2020-11-07 14:45:24 -05:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
									
										
										
										
											2020-08-13 14:38:05 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								#  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								# Verify that IPMI connectivity to this host exists (used during node init)  
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								#  
						 
					
						
							
								
									
										
										
										
											2021-08-21 02:46:11 -04:00 
										
									 
								 
							 
							
								
									
										 
								
							 
							
								 
							
							
								def verify_ipmi(ipmi_hostname, ipmi_user, ipmi_password):
								    """
								    Check that the IPMI interface of a host answers a power status query.

								    Builds an "ipmitool ... chassis power status" command line for
								    ipmi_hostname using the given ipmi_user and ipmi_password, and runs it
								    through common.run_os_command with a timeout of 2.

								    Returns True only when the command exits with code 0 and its stripped
								    output is exactly "Chassis Power is on". Every other outcome returns
								    False, including a non-zero exit code or any other reported power
								    state (such as "off").
								    """
								    status_query = (
								        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} "
								        f"-U {ipmi_user} -P {ipmi_password} chassis power status"
								    )
								    exit_code, output, _errors = common.run_os_command(status_query, timeout=2)

								    # A failed command means no usable answer; do not inspect its output
								    if exit_code != 0:
								        return False

								    # Only an explicit "on" report counts as verified connectivity
								    return output.strip() == "Chassis Power is on"