From c1782c50042cb271665ec357625b33f23a363323 Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface"
Date: Fri, 28 Apr 2023 10:48:28 -0400
Subject: [PATCH] Add full/nearfull OSD health detection

---
 daemon-common/cluster.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/daemon-common/cluster.py b/daemon-common/cluster.py
index 645dc165..ce448e22 100644
--- a/daemon-common/cluster.py
+++ b/daemon-common/cluster.py
@@ -51,6 +51,8 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
         "vm_stopped": 10,
         "osd_out": 50,
         "osd_down": 10,
+        "osd_full": 50,
+        "osd_nearfull": 10,
         "memory_overprovisioned": 50,
         "ceph_err": 50,
         "ceph_warn": 10,
@@ -110,6 +112,18 @@ def getClusterHealth(zkhandler, node_list, vm_list, ceph_osd_list):
                 f"cluster: Ceph OSD {ceph_osd['id']} in {up_texts[ceph_osd['stats']['up']].upper()} state"
             )
 
+        # Handle full or nearfull OSDs (>85%)
+        if ceph_osd["stats"]["utilization"] >= 90:
+            cluster_health_value -= health_delta_map["osd_full"]
+            cluster_health_messages.append(
+                f"cluster: Ceph OSD {ceph_osd['id']} is FULL ({ceph_osd['stats']['utilization']:.1f}% > 90%)"
+            )
+        elif ceph_osd["stats"]["utilization"] >= 85:
+            cluster_health_value -= health_delta_map["osd_nearfull"]
+            cluster_health_messages.append(
+                f"cluster: Ceph OSD {ceph_osd['id']} is NEARFULL ({ceph_osd['stats']['utilization']:.1f}% > 85%)"
+            )
+
     # Check for (n-1) overprovisioning
     # Assume X nodes. If the total VM memory allocation (counting only running VMss) is greater than
     # the total memory of the (n-1) smallest nodes, trigger this warning.