Implements a "maintenance mode" for PVC clusters. For now, the only thing this mode does is disable node fencing while the state is true. This allows the administrator to tell PVC that network connectivity, etc. might be interrupted and to avoid fencing nodes. Closes #70
#!/usr/bin/env python3

# cluster.py - PVC client function library, cluster management
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2020 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

import json

from distutils.util import strtobool

import client_lib.ansiprint as ansiprint
import client_lib.zkhandler as zkhandler
import client_lib.common as common
import client_lib.vm as pvc_vm
import client_lib.node as pvc_node
import client_lib.network as pvc_network
import client_lib.ceph as pvc_ceph

def set_maintenance(zk_conn, maint_state):
    # Write the requested maintenance state to the cluster-wide flag key
    try:
        if maint_state == 'true':
            zkhandler.writedata(zk_conn, {'/maintenance': 'true'})
            return True, 'Successfully set cluster in maintenance mode'
        else:
            zkhandler.writedata(zk_conn, {'/maintenance': 'false'})
            return True, 'Successfully set cluster in normal mode'
    except Exception:
        return False, 'Failed to set cluster maintenance state'

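# A minimal usage sketch (illustrative, not part of the module), assuming a
# connected ZooKeeper handle `zk_conn` like the one the other client_lib
# functions receive:
#
#   retcode, retmsg = set_maintenance(zk_conn, 'true')   # fencing suppressed
#   retcode, retmsg = set_maintenance(zk_conn, 'false')  # normal fencing resumes
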
def getClusterInformation(zk_conn):
    # Get cluster maintenance state; default to 'false' if the key is absent
    try:
        maint_state = zkhandler.readdata(zk_conn, '/maintenance')
    except Exception:
        maint_state = 'false'

    # Get node information object list
    retcode, node_list = pvc_node.get_list(zk_conn, None)

    # Get vm information object list
    retcode, vm_list = pvc_vm.get_list(zk_conn, None, None, None)

    # Get network information object list
    retcode, network_list = pvc_network.get_list(zk_conn, None, None)

    # Get storage information object lists
    retcode, ceph_osd_list = pvc_ceph.get_list_osd(zk_conn, None)
    retcode, ceph_pool_list = pvc_ceph.get_list_pool(zk_conn, None)
    retcode, ceph_volume_list = pvc_ceph.get_list_volume(zk_conn, None, None)
    retcode, ceph_snapshot_list = pvc_ceph.get_list_snapshot(zk_conn, None, None, None)

    # Determine, for each subsection, the total count
    node_count = len(node_list)
    vm_count = len(vm_list)
    network_count = len(network_list)
    ceph_osd_count = len(ceph_osd_list)
    ceph_pool_count = len(ceph_pool_list)
    ceph_volume_count = len(ceph_volume_list)
    ceph_snapshot_count = len(ceph_snapshot_list)

    # Determinations for node health
    node_healthy_status = list(range(0, node_count))
    node_report_status = list(range(0, node_count))
    for index, node in enumerate(node_list):
        daemon_state = node['daemon_state']
        domain_state = node['domain_state']
        if daemon_state != 'run' and domain_state != 'ready':
            node_healthy_status[index] = False
        else:
            node_healthy_status[index] = True
        node_report_status[index] = daemon_state + ',' + domain_state

    # Determinations for VM health
    vm_healthy_status = list(range(0, vm_count))
    vm_report_status = list(range(0, vm_count))
    for index, vm in enumerate(vm_list):
        vm_state = vm['state']
        if vm_state not in ['start', 'disable', 'migrate', 'unmigrate', 'provision']:
            vm_healthy_status[index] = False
        else:
            vm_healthy_status[index] = True
        vm_report_status[index] = vm_state

    # Determinations for OSD health
    ceph_osd_healthy_status = list(range(0, ceph_osd_count))
    ceph_osd_report_status = list(range(0, ceph_osd_count))
    for index, ceph_osd in enumerate(ceph_osd_list):
        try:
            ceph_osd_up = ceph_osd['stats']['up']
        except KeyError:
            ceph_osd_up = 0

        try:
            ceph_osd_in = ceph_osd['stats']['in']
        except KeyError:
            ceph_osd_in = 0

        if not ceph_osd_up or not ceph_osd_in:
            ceph_osd_healthy_status[index] = False
        else:
            ceph_osd_healthy_status[index] = True
        up_texts = {1: 'up', 0: 'down'}
        in_texts = {1: 'in', 0: 'out'}
        ceph_osd_report_status[index] = up_texts[ceph_osd_up] + ',' + in_texts[ceph_osd_in]

    # Determine the overall cluster health: maintenance mode takes precedence;
    # otherwise, if any element of a healthy_status list is False, the cluster
    # is Degraded
    if maint_state == 'true':
        cluster_health = 'Maintenance'
    elif False in node_healthy_status or False in vm_healthy_status or False in ceph_osd_healthy_status:
        cluster_health = 'Degraded'
    else:
        cluster_health = 'Optimal'

    # State lists
    node_state_combinations = [
        'run,ready', 'run,flush', 'run,flushed', 'run,unflush',
        'init,ready', 'init,flush', 'init,flushed', 'init,unflush',
        'stop,ready', 'stop,flush', 'stop,flushed', 'stop,unflush'
    ]
    vm_state_combinations = [
        'start', 'restart', 'shutdown', 'stop', 'disable', 'fail', 'migrate', 'unmigrate', 'provision'
    ]
    ceph_osd_state_combinations = [
        'up,in', 'up,out', 'down,in', 'down,out'
    ]

    # Format the Node states
    formatted_node_states = {'total': node_count}
    for state in node_state_combinations:
        state_count = 0
        for node_state in node_report_status:
            if node_state == state:
                state_count += 1
        if state_count > 0:
            formatted_node_states[state] = state_count

    # Format the VM states
    formatted_vm_states = {'total': vm_count}
    for state in vm_state_combinations:
        state_count = 0
        for vm_state in vm_report_status:
            if vm_state == state:
                state_count += 1
        if state_count > 0:
            formatted_vm_states[state] = state_count

    # Format the OSD states
    formatted_osd_states = {'total': ceph_osd_count}
    for state in ceph_osd_state_combinations:
        state_count = 0
        for ceph_osd_state in ceph_osd_report_status:
            if ceph_osd_state == state:
                state_count += 1
        if state_count > 0:
            formatted_osd_states[state] = state_count

    # Format the status data
    cluster_information = {
        'health': cluster_health,
        'primary_node': common.getPrimaryNode(zk_conn),
        'upstream_ip': zkhandler.readdata(zk_conn, '/upstream_ip'),
        'nodes': formatted_node_states,
        'vms': formatted_vm_states,
        'networks': network_count,
        'osds': formatted_osd_states,
        'pools': ceph_pool_count,
        'volumes': ceph_volume_count,
        'snapshots': ceph_snapshot_count
    }

    return cluster_information

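# For reference, the dict returned above has this shape (keys from the code,
# values illustrative):
#
#   {
#       'health': 'Optimal',
#       'primary_node': 'hv1',
#       'upstream_ip': '10.0.0.254',
#       'nodes': {'total': 3, 'run,ready': 3},
#       'vms': {'total': 2, 'start': 2},
#       'networks': 1,
#       'osds': {'total': 3, 'up,in': 3},
#       'pools': 1,
#       'volumes': 4,
#       'snapshots': 0
#   }
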
def get_info(zk_conn):
    # This is a thin wrapper function for naming purposes
    cluster_information = getClusterInformation(zk_conn)
    if cluster_information:
        return True, cluster_information
    else:
        return False, 'ERROR: Failed to obtain cluster information!'
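

# A minimal, hypothetical smoke test (illustrative only, not part of the PVC
# clients): connect to a local ZooKeeper coordinator with kazoo, which is the
# kind of handle the client_lib.zkhandler calls above assume, and print the
# cluster information. The coordinator address is an assumption for the example.
if __name__ == '__main__':
    from kazoo.client import KazooClient
    zk_conn = KazooClient(hosts='127.0.0.1:2181')
    zk_conn.start()
    retcode, retdata = get_info(zk_conn)
    # Pretty-print the information dict on success, or the error string on failure
    print(json.dumps(retdata, indent=4) if retcode else retdata)
    zk_conn.stop()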