From 672e58133f06e6320c47b248605ac880c3099f7c Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface"
Date: Mon, 4 Dec 2023 01:37:54 -0500
Subject: [PATCH] Implement interfaces to faults

---
 api-daemon/pvcapid/flaskapi.py   | 146 +++++++++++++++++++++++++++++
 api-daemon/pvcapid/helper.py     |  60 ++++++++++++
 client-cli/pvc/cli/cli.py        | 155 ++++++++++++++++++++++++-------
 client-cli/pvc/cli/formatters.py | 121 ++++++++++++++++++++++++
 client-cli/pvc/lib/faults.py     |  78 ++++++++++++++++
 daemon-common/faults.py          | 126 +++++++++++++++++++++++++
 6 files changed, 651 insertions(+), 35 deletions(-)
 create mode 100644 client-cli/pvc/lib/faults.py
 create mode 100644 daemon-common/faults.py

diff --git a/api-daemon/pvcapid/flaskapi.py b/api-daemon/pvcapid/flaskapi.py
index 014ebbf5..1eb2d2f1 100755
--- a/api-daemon/pvcapid/flaskapi.py
+++ b/api-daemon/pvcapid/flaskapi.py
@@ -622,6 +622,152 @@ class API_Status(Resource):
 api.add_resource(API_Status, "/status")
 
 
+# /faults
+class API_Faults(Resource):
+    @RequestParser(
+        [
+            {
+                "name": "sort_key",
+                "choices": (
+                    "first_reported",
+                    "last_reported",
+                    "acknowledged_at",
+                    "status",
+                    "health_delta",
+                    "message",
+                ),
+                "helptext": "A valid sort key must be specified",
+                "required": False,
+            },
+        ]
+    )
+    @Authenticator
+    def get(self, reqargs):
+        """
+        Return a list of cluster faults
+        ---
+        tags:
+          - faults
+        parameters:
+          - in: query
+            name: sort_key
+            type: string
+            required: false
+            description: The fault object key to sort results by (default last_reported)
+            enum:
+              - first_reported
+              - last_reported
+              - acknowledged_at
+              - status
+              - health_delta
+              - message
+        responses:
+          200:
+            description: OK
+            schema:
+              type: array
+              items:
+                type: object
+                id: fault
+                properties:
+                  id:
+                    type: string
+                    description: The ID of the fault
+                    example: "10ae144b78b4cc5fdf09e2ebbac51235"
+                  first_reported:
+                    type: date
+                    description: The first time the fault was reported
+                    example: "2023-12-01 16:47:59.849742"
+                  last_reported:
+                    type: date
+                    description: The last time the fault was reported
+                    example: "2023-12-01 17:39:45.188398"
+                  acknowledged_at:
+                    type: date
+                    description: The time the fault was acknowledged, or empty if not acknowledged
+                    example: "2023-12-01 17:50:00.000000"
+                  status:
+                    type: string
+                    description: The current state of the fault, either "new" or "ack" (acknowledged)
+                    example: "new"
+                  health_delta:
+                    type: integer
+                    description: The health delta (amount it reduces cluster health from 100%) of the fault
+                    example: 25
+                  message:
+                    type: string
+                    description: The textual description of the fault
+                    example: "Node hv1 was at 40% (psur@-10%, psql@-50%) <= 50% health"
+        """
+        return api_helper.fault_list(sort_key=reqargs.get("sort_key") or "last_reported")
+
+
+api.add_resource(API_Faults, "/faults")
+
+
+# /faults/<fault_id>
+class API_Faults_Element(Resource):
+    @Authenticator
+    def get(self, fault_id):
+        """
+        Return a single cluster fault
+        ---
+        tags:
+          - faults
+        responses:
+          200:
+            description: OK
+            schema:
+              type: array
+              items:
+                type: object
+                id: fault
+                $ref: '#/definitions/fault'
+        """
+        return api_helper.fault_list(limit=fault_id)
+
+    @Authenticator
+    def put(self, fault_id):
+        """
+        Acknowledge a cluster fault
+        ---
+        tags:
+          - faults
+        responses:
+          200:
+            description: OK
+            schema:
+              type: object
+              properties:
+                message:
+                  type: string
+                  description: A text message
+        """
+        return api_helper.fault_acknowledge(fault_id)
+
+    @Authenticator
+    def delete(self, fault_id):
+        """
+        Delete a cluster fault
+        ---
+        tags:
+          - faults
+        responses:
+          200:
+            description: OK
+            schema:
+              type: object
+              properties:
+                message:
+                  type: string
+                  description: A text message
+        """
+        return api_helper.fault_delete(fault_id)
+
+
+api.add_resource(API_Faults_Element, "/faults/<fault_id>")
+
+
 # /tasks
 class API_Tasks(Resource):
     @Authenticator
diff --git a/api-daemon/pvcapid/helper.py b/api-daemon/pvcapid/helper.py
index b859336a..8169f2c0 100755
--- a/api-daemon/pvcapid/helper.py
+++ b/api-daemon/pvcapid/helper.py
@@ -31,6 +31,7 @@ from 
daemon_lib.zkhandler import ZKConnection
 
 import daemon_lib.common as pvc_common
 import daemon_lib.cluster as pvc_cluster
+import daemon_lib.faults as pvc_faults
 import daemon_lib.node as pvc_node
 import daemon_lib.vm as pvc_vm
 import daemon_lib.network as pvc_network
@@ -118,6 +119,65 @@ def cluster_maintenance(zkhandler, maint_state="false"):
     return retdata, retcode
 
 
+#
+# Fault functions
+#
+@pvc_common.Profiler(config)
+@ZKConnection(config)
+def fault_list(zkhandler, limit=None, sort_key="last_reported"):
+    """
+    Return all faults sorted by SORT_KEY, or only fault ID LIMIT (404 if not found).
+    """
+    retflag, retdata = pvc_faults.get_list(zkhandler, limit=limit, sort_key=sort_key)
+
+    if not retflag:
+        retcode = 400
+        retdata = {"message": retdata}
+    elif limit is not None and len(retdata) < 1:
+        retcode = 404
+        retdata = {"message": f"No fault with ID {limit} found"}
+    else:
+        retcode = 200
+
+    return retdata, retcode
+
+
+@pvc_common.Profiler(config)
+@ZKConnection(config)
+def fault_acknowledge(zkhandler, fault_id):
+    """
+    Acknowledge a fault of FAULT_ID.
+    """
+    retflag, retdata = pvc_faults.acknowledge(zkhandler, fault_id)
+
+    if retflag:
+        retcode = 200
+    else:
+        retcode = 404
+
+    retdata = {"message": retdata}
+
+    return retdata, retcode
+
+
+@pvc_common.Profiler(config)
+@ZKConnection(config)
+def fault_delete(zkhandler, fault_id):
+    """
+    Delete a fault of FAULT_ID.
+    """
+    retflag, retdata = pvc_faults.delete(zkhandler, fault_id)
+
+    if retflag:
+        retcode = 200
+    else:
+        retcode = 404
+
+    retdata = {"message": retdata}
+
+    return retdata, retcode
+
+
 #
 # Node functions
 #
diff --git a/client-cli/pvc/cli/cli.py b/client-cli/pvc/cli/cli.py
index 25340996..19422013 100644
--- a/client-cli/pvc/cli/cli.py
+++ b/client-cli/pvc/cli/cli.py
@@ -37,6 +37,7 @@ from pvc.cli.parsers import *
 from pvc.cli.formatters import *
 
 import pvc.lib.cluster
+import pvc.lib.faults
 import pvc.lib.node
 import pvc.lib.vm
 import pvc.lib.network
@@ -347,40 +348,6 @@ def cli_cluster():
     pass
 
 
-###############################################################################
-# > pvc cluster status
-###############################################################################
-@click.command(
-    name="status",
-    short_help="Show cluster status.",
-)
-@connection_req
-@format_opt(
-    {
-        "pretty": cli_cluster_status_format_pretty,
-        "short": cli_cluster_status_format_short,
-        "json": lambda d: jdumps(d),
-        "json-pretty": lambda d: jdumps(d, indent=2),
-    }
-)
-def cli_cluster_status(
-    format_function,
-):
-    """
-    Show information and health about a PVC cluster.
-
-    \b
-    Format options:
-        "pretty": Output all details in a nice colourful format.
-        "short" Output only details about cluster health in a nice colourful format.
-        "json": Output in unformatted JSON.
-        "json-pretty": Output in formatted JSON.
-    """
-
-    retcode, retdata = pvc.lib.cluster.get_info(CLI_CONFIG)
-    finish(retcode, retdata, format_function)
-
-
 ###############################################################################
 # > pvc cluster init
 ###############################################################################
@@ -485,6 +452,120 @@ def cli_cluster_restore(
     """
 
 
+###############################################################################
+# > pvc cluster status
+###############################################################################
+@click.command(
+    name="status",
+    short_help="Show cluster status.",
+)
+@connection_req
+@format_opt(
+    {
+        "pretty": cli_cluster_status_format_pretty,
+        "short": cli_cluster_status_format_short,
+        "json": lambda d: jdumps(d),
+        "json-pretty": lambda d: jdumps(d, indent=2),
+    }
+)
+def cli_cluster_status(
+    format_function,
+):
+    """
+    Show information and health about a PVC cluster.
+
+    \b
+    Format options:
+        "pretty": Output all details in a nice colourful format.
+        "short" Output only details about cluster health in a nice colourful format.
+        "json": Output in unformatted JSON.
+        "json-pretty": Output in formatted JSON.
+    """
+
+    retcode, retdata = pvc.lib.cluster.get_info(CLI_CONFIG)
+    finish(retcode, retdata, format_function)
+
+
+###############################################################################
+# > pvc cluster fault
+###############################################################################
+@click.group(
+    name="fault",
+    short_help="Manage PVC cluster faults.",
+    context_settings=CONTEXT_SETTINGS,
+)
+def cli_cluster_fault():
+    """
+    Manage faults in the PVC cluster.
+    """
+    pass
+
+
+###############################################################################
+# > pvc cluster fault list
+###############################################################################
+@click.command(
+    name="list",
+    short_help="List all cluster faults.",
+)
+@click.argument("limit", default=None, required=False)
+@format_opt(
+    {
+        "pretty": cli_cluster_fault_list_format_pretty,
+        # NOTE: no "short" format is offered here; only the formats listed are valid
+        "json": lambda d: jdumps(d),
+        "json-pretty": lambda d: jdumps(d, indent=2),
+    }
+)
+@connection_req
+def cli_cluster_fault_list(limit, format_function):
+    """
+    List all faults in the PVC cluster, optionally limited to fault ID LIMIT.
+    """
+
+    retcode, retdata = pvc.lib.faults.get_list(
+        CLI_CONFIG,
+        limit=limit,
+    )
+    finish(retcode, retdata, format_function)
+
+
+###############################################################################
+# > pvc cluster fault ack
+###############################################################################
+@click.command(
+    name="ack",
+    short_help="Acknowledge a cluster fault.",
+)
+@click.argument("fault_id")
+@connection_req
+def cli_cluster_fault_acknowledge(fault_id):
+    """
+    Acknowledge the cluster fault FAULT_ID.
+    """
+
+    retcode, retdata = pvc.lib.faults.acknowledge(CLI_CONFIG, fault_id)
+    finish(retcode, retdata)
+
+
+###############################################################################
+# > pvc cluster fault delete
+###############################################################################
+@click.command(
+    name="delete",
+    short_help="Delete a cluster fault.",
+)
+@click.argument("fault_id")
+@connection_req
+def cli_cluster_fault_delete(fault_id):
+    """
+    Delete the cluster fault FAULT_ID.
+    """
+
+    retcode, retdata = pvc.lib.faults.delete(CLI_CONFIG, fault_id)
+    finish(retcode, retdata)
+
+
 ###############################################################################
 # > pvc cluster maintenance
 ###############################################################################
@@ -6170,10 +6251,14 @@ cli_provisioner_profile.add_command(cli_provisioner_profile_list)
 cli_provisioner.add_command(cli_provisioner_profile)
 cli_provisioner.add_command(cli_provisioner_create)
 cli.add_command(cli_provisioner)
-cli_cluster.add_command(cli_cluster_status)
 cli_cluster.add_command(cli_cluster_init)
 cli_cluster.add_command(cli_cluster_backup)
 cli_cluster.add_command(cli_cluster_restore)
+cli_cluster.add_command(cli_cluster_status)
+cli_cluster_fault.add_command(cli_cluster_fault_list)
+cli_cluster_fault.add_command(cli_cluster_fault_acknowledge)
+cli_cluster_fault.add_command(cli_cluster_fault_delete)
+cli_cluster.add_command(cli_cluster_fault)
 cli_cluster_maintenance.add_command(cli_cluster_maintenance_on)
 cli_cluster_maintenance.add_command(cli_cluster_maintenance_off)
 cli_cluster.add_command(cli_cluster_maintenance)
diff --git a/client-cli/pvc/cli/formatters.py b/client-cli/pvc/cli/formatters.py
index 791f7c21..3e6cefe8 100644
--- a/client-cli/pvc/cli/formatters.py
+++ b/client-cli/pvc/cli/formatters.py
@@ -261,6 +261,127 @@ def cli_cluster_status_format_short(CLI_CONFIG, data):
     return "\n".join(output)
 
 
+def cli_cluster_fault_list_format_pretty(CLI_CONFIG, fault_data):
+    """
+    Pretty format the output of cli_cluster_fault_list (two lines per fault: columns, then message)
+    """
+
+    fault_list_output = []
+
+    # Determine optimal column widths
+    fault_id_length = 3  # "ID"
+    fault_status_length = 7  # "Status"
+    fault_health_delta_length = 7  # "Health"
+    fault_acknowledged_at_length = 6  # "Ack'd"
+    fault_last_reported_length = 5  # "Last"
+    fault_first_reported_length = 6  # "First"
+    # Message goes on its own line
+
+    for fault in fault_data:
+        # fault_id column
+        _fault_id_length = len(str(fault["id"])) + 1
+        if _fault_id_length > fault_id_length:
+            fault_id_length = _fault_id_length
+
+        # status column
+        _fault_status_length = len(str(fault["status"])) + 1
+        if _fault_status_length > fault_status_length:
+            fault_status_length = _fault_status_length
+
+        # health_delta column
+        _fault_health_delta_length = len(str(fault["health_delta"])) + 1
+        if _fault_health_delta_length > fault_health_delta_length:
+            fault_health_delta_length = _fault_health_delta_length
+
+        # acknowledged_at column
+        _fault_acknowledged_at_length = len(str(fault["acknowledged_at"])) + 1
+        if _fault_acknowledged_at_length > fault_acknowledged_at_length:
+            fault_acknowledged_at_length = _fault_acknowledged_at_length
+
+        # last_reported column
+        _fault_last_reported_length = len(str(fault["last_reported"])) + 1
+        if _fault_last_reported_length > fault_last_reported_length:
+            fault_last_reported_length = _fault_last_reported_length
+
+        # first_reported column
+        _fault_first_reported_length = len(str(fault["first_reported"])) + 1
+        if _fault_first_reported_length > fault_first_reported_length:
+            fault_first_reported_length = _fault_first_reported_length
+
+    # Format the string (header)
+    fault_list_output.append(
+        "{bold}{fault_id: <{fault_id_length}} {fault_status: <{fault_status_length}} {fault_health_delta: <{fault_health_delta_length}} {fault_acknowledged_at: <{fault_acknowledged_at_length}} {fault_last_reported: <{fault_last_reported_length}} {fault_first_reported: <{fault_first_reported_length}}{end_bold}".format(
+            bold=ansii["bold"],
+            end_bold=ansii["end"],
+            fault_id_length=fault_id_length,
+            fault_status_length=fault_status_length,
+            fault_health_delta_length=fault_health_delta_length,
+            fault_acknowledged_at_length=fault_acknowledged_at_length,
+            fault_last_reported_length=fault_last_reported_length,
+            fault_first_reported_length=fault_first_reported_length,
+            fault_id="ID",
+            fault_status="Status",
+            fault_health_delta="Health",
+            fault_acknowledged_at="Ack'd",
+            fault_last_reported="Last",
+            fault_first_reported="First",
+        )
+    )
+    fault_list_output.append(
+        "{bold}> {fault_message}{end_bold}".format(
+            bold=ansii["bold"],
+            end_bold=ansii["end"],
+            fault_message="Message",
+        )
+    )
+
+    for fault in sorted(
+        fault_data,
+        key=lambda x: (x["status"], x["health_delta"], x["last_reported"]),
+        reverse=True,
+    ):
+        health_delta = fault["health_delta"]
+        if fault["acknowledged_at"] != "":
+            health_colour = ansii["blue"]
+        elif health_delta >= 50:
+            health_colour = ansii["red"]
+        elif health_delta >= 10:
+            health_colour = ansii["yellow"]
+        else:
+            health_colour = ansii["green"]
+
+        fault_list_output.append("")
+        fault_list_output.append(
+            "{bold}{fault_id: <{fault_id_length}} {health_colour}{fault_status: <{fault_status_length}} {fault_health_delta: <{fault_health_delta_length}}{end_colour} {fault_acknowledged_at: <{fault_acknowledged_at_length}} {fault_last_reported: <{fault_last_reported_length}} {fault_first_reported: <{fault_first_reported_length}}{end_bold}".format(
+                bold="",
+                end_bold="",
+                health_colour=health_colour,
+                end_colour=ansii["end"],
+                fault_id_length=fault_id_length,
+                fault_status_length=fault_status_length,
+                fault_health_delta_length=fault_health_delta_length,
+                fault_acknowledged_at_length=fault_acknowledged_at_length,
+                fault_last_reported_length=fault_last_reported_length,
+                fault_first_reported_length=fault_first_reported_length,
+                fault_id=fault["id"],
+                fault_status=fault["status"].title(),
+                fault_health_delta=f"-{fault['health_delta']}%",
+                fault_acknowledged_at=fault["acknowledged_at"]
+                if fault["acknowledged_at"] != ""
+                else "N/A",
+                fault_last_reported=fault["last_reported"],
+                fault_first_reported=fault["first_reported"],
+            )
+        )
+        fault_list_output.append(
+            "> {fault_message}".format(
+                fault_message=fault["message"],
+            )
+        )
+
+    return "\n".join(fault_list_output)
+
+
 def cli_cluster_task_format_pretty(CLI_CONFIG, task_data):
     """
     Pretty format the output of cli_cluster_task
diff --git a/client-cli/pvc/lib/faults.py 
b/client-cli/pvc/lib/faults.py
new file mode 100644
index 00000000..f8bc0cc4
--- /dev/null
+++ b/client-cli/pvc/lib/faults.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+# faults.py - PVC CLI client function library, faults management
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+# Copyright (C) 2018-2022 Joshua M. Boniface
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+from pvc.lib.common import call_api
+
+
+def get_list(config, limit=None, sort_key="last_reported"):
+    """
+    Get list of PVC faults, or only the fault with ID {limit} if one is given
+
+    API endpoint: GET /api/v1/faults or GET /api/v1/faults/{limit}
+    API arguments: sort_key={sort_key} (sent only when no limit is given)
+    API schema: {json_data_object}
+    """
+    if limit is not None:
+        params = {}
+        endpoint = f"/faults/{limit}"
+    else:
+        params = {"sort_key": sort_key}
+        endpoint = "/faults"
+
+    response = call_api(config, "get", endpoint, params=params)
+
+    if response.status_code == 200:
+        return True, response.json()
+    else:
+        return False, response.json().get("message", "")
+
+
+def acknowledge(config, fault_id):
+    """
+    Acknowledge a PVC fault
+
+    API endpoint: PUT /api/v1/faults/{fault_id}
+    API arguments:
+    API schema: {json_message}
+    """
+    response = call_api(config, "put", f"/faults/{fault_id}")
+
+    # Success and failure bodies are both reduced to their "message" field
+    if response.status_code == 200:
+        return True, response.json().get("message", "")
+    else:
+        return False, response.json().get("message", "")
+
+
+def delete(config, fault_id):
+    """
+    Delete a PVC fault
+
+    API endpoint: DELETE /api/v1/faults/{fault_id}
+    API arguments:
+    API schema: {json_message}
+    """
+    response = call_api(config, "delete", f"/faults/{fault_id}")
+
+    if response.status_code == 200:
+        return True, response.json().get("message", "")
+    else:
+        return False, response.json().get("message", "")
diff --git a/daemon-common/faults.py b/daemon-common/faults.py
new file mode 100644
index 00000000..f8a9d0e0
--- /dev/null
+++ b/daemon-common/faults.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+
+# faults.py - PVC client function library, faults management
+# Part of the Parallel Virtual Cluster (PVC) system
+#
+# Copyright (C) 2018-2022 Joshua M. Boniface
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+from datetime import datetime
+
+
+def getFault(zkhandler, fault_id):
+    """
+    Get the details of fault FAULT_ID as a dict, or None if no such fault exists
+    """
+    if not zkhandler.exists(("faults.id", fault_id)):
+        return None
+
+    fault_last_time = zkhandler.read(("faults.last_time", fault_id))
+    fault_first_time = zkhandler.read(("faults.first_time", fault_id))
+    fault_ack_time = zkhandler.read(("faults.ack_time", fault_id))
+    fault_status = zkhandler.read(("faults.status", fault_id))
+    fault_delta = int(zkhandler.read(("faults.delta", fault_id)))
+    fault_message = zkhandler.read(("faults.message", fault_id))
+
+    fault = {
+        "id": fault_id,
+        "last_reported": fault_last_time,
+        "first_reported": fault_first_time,
+        "acknowledged_at": fault_ack_time,
+        "status": fault_status,
+        "health_delta": fault_delta,
+        "message": fault_message,
+    }
+
+    return fault
+
+
+def getAllFaults(zkhandler, sort_key="last_reported"):
+    """
+    Get the details of all registered faults, sorted by SORT_KEY
+    """
+    all_faults = zkhandler.children("base.faults")
+
+    # A fault can be deleted between the listing and the read; getFault()
+    # then returns None, which must not reach the sort below
+    faults_detail = [
+        fault
+        for fault in (getFault(zkhandler, fault_id) for fault_id in all_faults)
+        if fault is not None
+    ]
+
+    sorted_faults = sorted(faults_detail, key=lambda x: x[sort_key])
+    # Sort newest-first for time-based sorts
+    if sort_key in ["first_reported", "last_reported", "acknowledged_at"]:
+        sorted_faults.reverse()
+
+    return sorted_faults
+
+
+def get_list(zkhandler, limit=None, sort_key="last_reported"):
+    """
+    Get a list of all known faults sorted by SORT_KEY, optionally only fault ID LIMIT
+    """
+    if sort_key not in [
+        "first_reported",
+        "last_reported",
+        "acknowledged_at",
+        "status",
+        "health_delta",
+        "message",
+    ]:
+        return False, f"Invalid sort key {sort_key} provided"
+
+    all_faults = getAllFaults(zkhandler, sort_key=sort_key)
+
+    if limit is not None:
+        all_faults = [fault for fault in all_faults if fault["id"] == limit]
+
+    return True, all_faults
+
+
+def acknowledge(zkhandler, fault_id):
+    """
+    Acknowledge a fault, storing the current time (as a string) as its ack time
+    """
+    fault = getFault(zkhandler, fault_id)
+
+    if fault is None:
+        return False, f"No fault with ID {fault_id} found"
+
+    zkhandler.write(
+        [
+            (("faults.ack_time", fault_id), str(datetime.now())),
+            (("faults.status", fault_id), "ack"),
+        ]
+    )
+
+    return True, f"Successfully acknowledged fault {fault_id}"
+
+
+def delete(zkhandler, fault_id):
+    """
+    Delete a fault and everything stored beneath it
+    """
+    fault = getFault(zkhandler, fault_id)
+
+    if fault is None:
+        return False, f"No fault with ID {fault_id} found"
+
+    zkhandler.delete(("faults.id", fault_id), recursive=True)
+
+    return True, f"Successfully deleted fault {fault_id}"