From e7ab1bfddd98b3ed8b8b3cefbf7d3234c5e525b6 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Sun, 18 Oct 2020 14:46:32 -0400 Subject: [PATCH] Add cluster overprovision determination Adds a check of (n-1) memory overprovisioning. (n-1) is considered to be the configuration that excludes the "largest" node. The cluster will report degraded when in this state. --- daemon-common/cluster.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/daemon-common/cluster.py b/daemon-common/cluster.py index 709884bf..99be4bd7 100644 --- a/daemon-common/cluster.py +++ b/daemon-common/cluster.py @@ -79,6 +79,36 @@ def getClusterInformation(zk_conn): ceph_volume_count = len(ceph_volume_list) ceph_snapshot_count = len(ceph_snapshot_list) + # Determinations for general cluster health + cluster_healthy_status = True + # Check for (n-1) overprovisioning + # Assume n nodes. If the total VM memory allocation (counting only running VMs) is greater than + # the total memory of the (n-1) smallest nodes, trigger this warning. 
+ n_minus_1_total = 0 + alloc_total = 0 + + node_largest_index = None + node_largest_count = 0 + for index, node in enumerate(node_list): + node_mem_total = node['memory']['total'] + node_mem_alloc = node['memory']['allocated'] + alloc_total += node_mem_alloc + + # Determine if this node is the largest seen so far + if node_mem_total > node_largest_count: + node_largest_index = index + node_largest_count = node_mem_total + n_minus_1_node_list = list() + for index, node in enumerate(node_list): + if index == node_largest_index: + continue + n_minus_1_node_list.append(node) + for index, node in enumerate(n_minus_1_node_list): + n_minus_1_total += node['memory']['total'] + if alloc_total > n_minus_1_total: + cluster_healthy_status = False + cluster_health_msg.append("Total VM memory ({}) is overprovisioned (max {}) for (n-1) failure scenarios".format(alloc_total, n_minus_1_total)) + # Determinations for node health node_healthy_status = list(range(0, node_count)) node_report_status = list(range(0, node_count)) @@ -131,7 +161,7 @@ def getClusterInformation(zk_conn): # Find out the overall cluster health; if any element of a healthy_status is false, it's unhealthy if maint_state == 'true': cluster_health = 'Maintenance' - elif False in node_healthy_status or False in vm_healthy_status or False in ceph_osd_healthy_status: + elif cluster_healthy_status is False or False in node_healthy_status or False in vm_healthy_status or False in ceph_osd_healthy_status: cluster_health = 'Degraded' else: cluster_health = 'Optimal'