diff --git a/client-api/api_lib/pvcapi_helper.py b/client-api/api_lib/pvcapi_helper.py
index 75ccab55..d78406ab 100755
--- a/client-api/api_lib/pvcapi_helper.py
+++ b/client-api/api_lib/pvcapi_helper.py
@@ -47,6 +47,7 @@ def initialize_cluster():
     transaction = zk_conn.transaction()
     transaction.create('/primary_node', 'none'.encode('ascii'))
     transaction.create('/upstream_ip', 'none'.encode('ascii'))
+    transaction.create('/maintenance', 'False'.encode('ascii'))
     transaction.create('/nodes', ''.encode('ascii'))
     transaction.create('/domains', ''.encode('ascii'))
     transaction.create('/networks', ''.encode('ascii'))
@@ -69,7 +70,7 @@ def initialize_cluster():
     return True
 
 #
-# Status function
+# Cluster functions
 #
 def cluster_status():
     """
@@ -81,6 +82,24 @@ def cluster_status():
 
     return retdata, 200
 
+def cluster_maintenance(maint_state='false'):
+    """
+    Set the cluster in or out of maintenance state
+    """
+    zk_conn = pvc_common.startZKConnection(config['coordinators'])
+    retflag, retdata = pvc_cluster.set_maintenance(zk_conn, maint_state)
+    pvc_common.stopZKConnection(zk_conn)
+
+    retdata = {
+        'message': retdata
+    }
+    if retflag:
+        retcode = 200
+    else:
+        retcode = 400
+
+    return retdata, retcode
+
 #
 # Node functions
 #
diff --git a/client-api/pvc-api.py b/client-api/pvc-api.py
index c0fed28d..ab85ac44 100755
--- a/client-api/pvc-api.py
+++ b/client-api/pvc-api.py
@@ -390,6 +390,37 @@ class API_Status(Resource):
             description: Bad request
         """
         return api_helper.cluster_status()
+
+    @RequestParser([
+        { 'name': 'state', 'choices': ('true', 'false'), 'required': True, 'helpmsg': "A valid state must be specified" }
+    ])
+    @Authenticator
+    def post(self, reqargs):
+        """
+        Set the cluster maintenance mode
+        ---
+        tags:
+          - node
+        parameters:
+          - in: query
+            name: state
+            type: boolean
+            required: true
+            description: The cluster maintenance state
+        responses:
+          200:
+            description: OK
+            schema:
+              type: object
+              id: Message
+          400:
+            description: Bad request
+            schema:
+              type: object
+              id: Message
+        """
+        return api_helper.cluster_maintenance(reqargs.get('state', 'false'))
+
 api.add_resource(API_Status, '/status')
diff --git a/client-cli/cli_lib/cluster.py b/client-cli/cli_lib/cluster.py
index 9fe37af2..96a34280 100644
--- a/client-cli/cli_lib/cluster.py
+++ b/client-cli/cli_lib/cluster.py
@@ -42,6 +42,26 @@ def initialize(config):
 
     return retstatus, response.json()['message']
 
+def maintenance_mode(config, state):
+    """
+    Enable or disable PVC cluster maintenance mode
+
+    API endpoint: POST /api/v1/status
+    API arguments: {state}={state}
+    API schema: {json_data_object}
+    """
+    params = {
+        'state': state
+    }
+    response = call_api(config, 'post', '/status', params=params)
+
+    if response.status_code == 200:
+        retstatus = True
+    else:
+        retstatus = False
+
+    return retstatus, response.json()['message']
+
 def get_info(config):
     """
     Get status of the PVC cluster
@@ -67,6 +87,8 @@ def format_info(cluster_information, oformat):
     # Plain formatting, i.e. human-readable
     if cluster_information['health'] == 'Optimal':
         health_colour = ansiprint.green()
+    elif cluster_information['health'] == 'Maintenance':
+        health_colour = ansiprint.blue()
     else:
         health_colour = ansiprint.yellow()
 
diff --git a/client-cli/pvc.py b/client-cli/pvc.py
index f02d7c3f..9f68be4e 100755
--- a/client-cli/pvc.py
+++ b/client-cli/pvc.py
@@ -3054,15 +3054,40 @@ def provisioner_status(job):
 
     cleanup(retcode, retdata)
 
+###############################################################################
+# pvc maintenance
+###############################################################################
+@click.group(name='maintenance', short_help='Manage PVC cluster maintenance state.', context_settings=CONTEXT_SETTINGS)
+def cli_maintenance():
+    """
+    Manage the maintenance mode of the PVC cluster.
+    """
+    # Abort commands under this group if config is bad
+    if config.get('badcfg', None):
+        click.echo('No cluster specified and no local pvc-api.yaml configuration found. Use "pvc cluster" to add a cluster API to connect to.')
+        exit(1)
 
+###############################################################################
+# pvc maintenance on
+###############################################################################
+@click.command(name='on', short_help='Enable cluster maintenance mode.')
+def maintenance_on():
+    """
+    Enable maintenance mode on the PVC cluster.
+    """
+    retcode, retdata = pvc_cluster.maintenance_mode(config, 'true')
+    cleanup(retcode, retdata)
-
-
-
-
-
-
-
+###############################################################################
+# pvc maintenance off
+###############################################################################
+@click.command(name='off', short_help='Disable cluster maintenance mode.')
+def maintenance_off():
+    """
+    Disable maintenance mode on the PVC cluster.
+ """ + retcode, retdata = pvc_cluster.maintenance_mode(config, 'false') + cleanup(retcode, retdata) ############################################################################### @@ -3291,12 +3316,16 @@ cli_provisioner.add_command(provisioner_profile) cli_provisioner.add_command(provisioner_create) cli_provisioner.add_command(provisioner_status) +cli_maintenance.add_command(maintenance_on) +cli_maintenance.add_command(maintenance_off) + cli.add_command(cli_cluster) cli.add_command(cli_node) cli.add_command(cli_vm) cli.add_command(cli_network) cli.add_command(cli_storage) cli.add_command(cli_provisioner) +cli.add_command(cli_maintenance) cli.add_command(status_cluster) cli.add_command(init_cluster) diff --git a/client-common/cluster.py b/client-common/cluster.py index fca2ecfa..bb2477f0 100644 --- a/client-common/cluster.py +++ b/client-common/cluster.py @@ -20,9 +20,10 @@ # ############################################################################### -import click import json +from distutils.util import strtobool + import client_lib.ansiprint as ansiprint import client_lib.zkhandler as zkhandler import client_lib.common as common @@ -31,7 +32,24 @@ import client_lib.node as pvc_node import client_lib.network as pvc_network import client_lib.ceph as pvc_ceph +def set_maintenance(zk_conn, maint_state): + try: + if maint_state == 'true': + zkhandler.writedata(zk_conn, {'/maintenance': 'true'}) + return True, 'Successfully set cluster in maintenance mode' + else: + zkhandler.writedata(zk_conn, {'/maintenance': 'false'}) + return True, 'Successfully set cluster in normal mode' + except: + return False, 'Failed to set cluster maintenance state' + def getClusterInformation(zk_conn): + # Get cluster maintenance state + try: + maint_state = zkhandler.readdata(zk_conn, '/maintenance') + except: + maint_state = 'false' + # Get node information object list retcode, node_list = pvc_node.get_list(zk_conn, None) @@ -102,7 +120,9 @@ def getClusterInformation(zk_conn): ceph_osd_report_status[index] = up_texts[ceph_osd_up] + ',' + in_texts[ceph_osd_in] # Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy - if False in node_healthy_status or False in vm_healthy_status or False in ceph_osd_healthy_status: + if maint_state == 'true': + cluster_health = 'Maintenance' + elif False in node_healthy_status or False in vm_healthy_status or False in ceph_osd_healthy_status: cluster_health = 'Degraded' else: cluster_health = 'Optimal' @@ -173,75 +193,3 @@ def get_info(zk_conn): return True, cluster_information else: return False, 'ERROR: Failed to obtain cluster information!' - -def format_info(cluster_information, oformat): - if oformat == 'json': - print(json.dumps(cluster_information)) - return - - if oformat == 'json-pretty': - print(json.dumps(cluster_information, indent=4)) - return - - # Plain formatting, i.e. 
human-readable - if cluster_information['health'] == 'Optimal': - health_colour = ansiprint.green() - else: - health_colour = ansiprint.yellow() - - ainformation = [] - ainformation.append('{}PVC cluster status:{}'.format(ansiprint.bold(), ansiprint.end())) - ainformation.append('') - ainformation.append('{}Cluster health:{} {}{}{}'.format(ansiprint.purple(), ansiprint.end(), health_colour, cluster_information['health'], ansiprint.end())) - ainformation.append('{}Primary node:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['primary_node'])) - ainformation.append('{}Cluster upstream IP:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['upstream_ip'])) - ainformation.append('') - ainformation.append('{}Total nodes:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['nodes']['total'])) - ainformation.append('{}Total VMs:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['vms']['total'])) - ainformation.append('{}Total networks:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['networks'])) - ainformation.append('{}Total OSDs:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['osds']['total'])) - ainformation.append('{}Total pools:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['pools'])) - ainformation.append('{}Total volumes:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['volumes'])) - ainformation.append('{}Total snapshots:{} {}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['snapshots'])) - - nodes_string = '{}Nodes:{} {}/{} {}ready,run{}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['nodes']['run,ready'], cluster_information['nodes']['total'], ansiprint.green(), ansiprint.end()) - for state, count in cluster_information['nodes'].items(): - if state == 'total' or state == 'run,ready': - continue - - nodes_string += ' {}/{} {}{}{}'.format(count, cluster_information['nodes']['total'], ansiprint.yellow(), state, ansiprint.end()) - - ainformation.append('') - ainformation.append(nodes_string) - - vms_string = '{}VMs:{} {}/{} {}start{}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['vms']['start'], cluster_information['vms']['total'], ansiprint.green(), ansiprint.end()) - for state, count in cluster_information['vms'].items(): - if state == 'total' or state == 'start': - continue - - if state == 'disable': - colour = ansiprint.blue() - else: - colour = ansiprint.yellow() - - vms_string += ' {}/{} {}{}{}'.format(count, cluster_information['vms']['total'], colour, state, ansiprint.end()) - - ainformation.append('') - ainformation.append(vms_string) - - if cluster_information['osds']['total'] > 0: - osds_string = '{}Ceph OSDs:{} {}/{} {}up,in{}'.format(ansiprint.purple(), ansiprint.end(), cluster_information['osds']['up,in'], cluster_information['osds']['total'], ansiprint.green(), ansiprint.end()) - for state, count in cluster_information['osds'].items(): - if state == 'total' or state == 'up,in': - continue - - osds_string += ' {}/{} {}{}{}'.format(count, cluster_information['osds']['total'], ansiprint.yellow(), state, ansiprint.end()) - - ainformation.append('') - ainformation.append(osds_string) - - information = '\n'.join(ainformation) - click.echo(information) - - click.echo('') - diff --git a/debian/control b/debian/control index 7dc27004..3e4bb7c3 100644 --- a/debian/control +++ b/debian/control @@ -8,7 +8,7 @@ X-Python3-Version: >= 3.2 Package: pvc-daemon 
 Architecture: all
-Depends: systemd, pvc-client-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
+Depends: systemd, pvc-client-common, python3-kazoo, python3-psutil, python3-apscheduler, python3-libvirt, python3-psycopg2, python3-dnspython, python3-yaml, python3-distutils, ipmitool, libvirt-daemon-system, arping, vlan, bridge-utils, dnsmasq, nftables, pdns-server, pdns-backend-pgsql
 Suggests: pvc-client-api, pvc-client-cli
 Description: Parallel Virtual Cluster virtualization daemon (Python 3)
  A KVM/Zookeeper/Ceph-based VM and private cloud manager
diff --git a/docs/manuals/swagger.json b/docs/manuals/swagger.json
index c5d244bf..6922dee5 100644
--- a/docs/manuals/swagger.json
+++ b/docs/manuals/swagger.json
@@ -3653,6 +3653,36 @@
                 "tags": [
                     "root"
                 ]
+            },
+            "post": {
+                "description": "",
+                "parameters": [
+                    {
+                        "description": "The cluster maintenance state",
+                        "in": "query",
+                        "name": "state",
+                        "required": true,
+                        "type": "boolean"
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "OK",
+                        "schema": {
+                            "$ref": "#/definitions/Message"
+                        }
+                    },
+                    "400": {
+                        "description": "Bad request",
+                        "schema": {
+                            "$ref": "#/definitions/Message"
+                        }
+                    }
+                },
+                "summary": "Set the cluster maintenance mode",
+                "tags": [
+                    "node"
+                ]
             }
         },
         "/api/v1/storage/ceph/option": {
diff --git a/node-daemon/pvcd/Daemon.py b/node-daemon/pvcd/Daemon.py
index 0640e77e..583a922b 100644
--- a/node-daemon/pvcd/Daemon.py
+++ b/node-daemon/pvcd/Daemon.py
@@ -42,6 +42,8 @@ import json
 import ipaddress
 import apscheduler.schedulers.background
 
+from distutils.util import strtobool
+
 import pvcd.log as log
 import pvcd.zkhandler as zkhandler
 import pvcd.fencing as fencing
@@ -112,6 +114,9 @@ try:
 except IndexError:
     mynodeid = 1
 
+# Maintenance mode off by default
+maintenance = False
+
 # Gather useful data about our host
 # Static data format: 'cpu_count', 'arch', 'os', 'kernel'
 staticdata = []
@@ -771,6 +776,15 @@ def update_nodes(new_node_list):
 # Alias for our local node (passed to network and domain objects)
 this_node = d_node[myhostname]
 
+# Maintenance mode
+@zk_conn.DataWatch('/maintenance')
+def set_maintenance(_maintenance, stat, event=''):
+    global maintenance
+    try:
+        maintenance = bool(strtobool(_maintenance.decode('ascii')))
+    except:
+        maintenance = False
+
 # Primary node
 @zk_conn.DataWatch('/primary_node')
 def update_primary(new_primary, stat, event=''):
@@ -1271,34 +1285,35 @@ def update_zookeeper():
         lv_conn.close()
 
     # Look for dead nodes and fence them
-    if debug:
-        print("Look for dead nodes and fence them")
-    if config['daemon_mode'] == 'coordinator':
-        for node_name in d_node:
-            try:
-                node_daemon_state = zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name))
-                node_domain_state = zkhandler.readdata(zk_conn, '/nodes/{}/domainstate'.format(node_name))
-                node_keepalive = int(zkhandler.readdata(zk_conn, '/nodes/{}/keepalive'.format(node_name)))
-            except:
-                node_daemon_state = 'unknown'
-                node_domain_state = 'unknown'
-                node_keepalive = 0
-
-            # Handle deadtime and fencng if needed
-            # (A node is considered dead when its keepalive timer is >6*keepalive_interval seconds
-            # out-of-date while in 'start' state)
-            node_deadtime = int(time.time()) - ( int(config['keepalive_interval']) * int(config['fence_intervals']) )
-            if node_keepalive < node_deadtime and node_daemon_state == 'run':
-                logger.out('Node {} seems dead - starting monitor for fencing'.format(node_name), state='w')
-                zk_lock = zkhandler.writelock(zk_conn, '/nodes/{}/daemonstate'.format(node_name))
-                with zk_lock:
-                    # Ensures that, if we lost the lock race and come out of waiting,
-                    # we won't try to trigger our own fence thread.
-                    if zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) != 'dead':
-                        fence_thread = threading.Thread(target=fencing.fenceNode, args=(node_name, zk_conn, config, logger), kwargs={})
-                        fence_thread.start()
-                        # Write the updated data after we start the fence thread
-                        zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(node_name): 'dead' })
+    if not maintenance:
+        if debug:
+            print("Look for dead nodes and fence them")
+        if config['daemon_mode'] == 'coordinator':
+            for node_name in d_node:
+                try:
+                    node_daemon_state = zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name))
+                    node_domain_state = zkhandler.readdata(zk_conn, '/nodes/{}/domainstate'.format(node_name))
+                    node_keepalive = int(zkhandler.readdata(zk_conn, '/nodes/{}/keepalive'.format(node_name)))
+                except:
+                    node_daemon_state = 'unknown'
+                    node_domain_state = 'unknown'
+                    node_keepalive = 0
+
+                # Handle deadtime and fencing if needed
+                # (A node is considered dead when its keepalive timer is >6*keepalive_interval seconds
+                # out-of-date while in 'start' state)
+                node_deadtime = int(time.time()) - ( int(config['keepalive_interval']) * int(config['fence_intervals']) )
+                if node_keepalive < node_deadtime and node_daemon_state == 'run':
+                    logger.out('Node {} seems dead - starting monitor for fencing'.format(node_name), state='w')
+                    zk_lock = zkhandler.writelock(zk_conn, '/nodes/{}/daemonstate'.format(node_name))
+                    with zk_lock:
+                        # Ensures that, if we lost the lock race and come out of waiting,
+                        # we won't try to trigger our own fence thread.
+                        if zkhandler.readdata(zk_conn, '/nodes/{}/daemonstate'.format(node_name)) != 'dead':
+                            fence_thread = threading.Thread(target=fencing.fenceNode, args=(node_name, zk_conn, config, logger), kwargs={})
+                            fence_thread.start()
+                            # Write the updated data after we start the fence thread
+                            zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(node_name): 'dead' })
 
     # Display node information to the terminal
     if config['log_keepalives']:
@@ -1321,6 +1336,7 @@
             )
         if config['log_keepalive_cluster_details']:
             logger.out(
+                '{bold}Maintenance:{nofmt} {maint} '
                 '{bold}Active VMs:{nofmt} {domcount} '
                 '{bold}Networks:{nofmt} {netcount} '
                 '{bold}Load:{nofmt} {load} '
@@ -1329,6 +1345,7 @@
                 '{bold}Free:{nofmt} {freemem}'.format(
                     bold=fmt_bold,
                     nofmt=fmt_end,
+                    maint=maintenance,
                     domcount=this_node.domains_count,
                     netcount=len(network_list),
                     load=this_node.cpuload,