Add node network statistics and utilization values

Adds a new physical network interface stats parser to the node keepalives, and leverages this information to provide a network utilization overview in the Prometheus metrics.
2023-12-21 15:12:20 -05:00
parent d2d2a9c617
commit 3e4cc53fdd
6 changed files with 386 additions and 13 deletions
--- a/daemon-common/cluster.py
+++ b/daemon-common/cluster.py
@ -619,6 +619,54 @@ def get_resource_metrics(zkhandler):

    output_lines = list()

+    #
+    # Network Utilization stats
+    #
+    # This is a bit of a doozy. First, for each node, we have to determine the % utilization
+    # of all the (active) network interfaces on that node, averaged together. Then we average
+    # the values of all the nodes together.
+    # This is very rough, but should give some idea as to the total network bandwidth used
+    # and available.
+    all_total_speed = 0
+    all_total_util = 0
+    all_total_count = 0
+    for node in node_data:
+        if node["daemon_state"] != "run":
+            continue
+
+        total_speed = 0
+        total_util = 0
+        total_count = 0
+        for iface in node["interfaces"].keys():
+            link_state = node["interfaces"][iface]["state"]
+            if link_state != "up":
+                continue
+
+            link_speed = node["interfaces"][iface]["link_speed"] * 2  # full-duplex
+            total_speed += link_speed
+
+            total_bps = node["interfaces"][iface]["total_bps"]
+            total_util += total_bps
+
+            total_count += 1
+
+        if total_count > 0:
+            # Average the speed and util by the count
+            avg_speed = int(total_speed / total_count)
+            all_total_speed += avg_speed
+            avg_util = int(total_util / total_count)
+            all_total_util += avg_util
+
+            all_total_count += 1
+
+    if all_total_count > 0:
+        all_avg_speed = all_total_speed / all_total_count
+        all_avg_util = all_total_util / all_total_count
+
+        used_network_percentage = all_avg_util / all_avg_speed * 100
+    else:
+        used_network_percentage = 0
+
    #
    # Cluster stats
    #
@ -633,7 +681,15 @@ def get_resource_metrics(zkhandler):
    total_cpu = sum(node_sorted_cpu[:-2])
    used_cpu = sum([n["load"] for n in node_data])
    used_cpu_percentage = used_cpu / total_cpu * 100
-    output_lines.append(f"pvc_cluster_cpu_utilization {used_cpu_percentage:.2f}")
+    output_lines.append(f"pvc_cluster_cpu_utilization {used_cpu_percentage:2.2f}")
+
+    output_lines.append(
+        "# HELP pvc_cluster_network_utilization PVC cluster network utilization percentage"
+    )
+    output_lines.append("# TYPE pvc_cluster_network_utilization gauge")
+    output_lines.append(
+        f"pvc_cluster_network_utilization {used_network_percentage:2.2f}"
+    )

    node_sorted_memory = [
        n["memory"]["total"]
@ -648,7 +704,7 @@ def get_resource_metrics(zkhandler):
    )
    output_lines.append("# TYPE pvc_cluster_memory_real_utilization gauge")
    output_lines.append(
-        f"pvc_cluster_memory_real_utilization {used_memory_percentage:.2f}"
+        f"pvc_cluster_memory_real_utilization {used_memory_percentage:2.2f}"
    )

    allocated_memory = sum([n["memory"]["allocated"] for n in node_data])
@ -658,7 +714,7 @@ def get_resource_metrics(zkhandler):
    )
    output_lines.append("# TYPE pvc_cluster_memory_allocated_utilization gauge")
    output_lines.append(
-        f"pvc_cluster_memory_allocated_utilization {allocated_memory_percentage:.2f}"
+        f"pvc_cluster_memory_allocated_utilization {allocated_memory_percentage:2.2f}"
    )

    provisioned_memory = sum([n["memory"]["provisioned"] for n in node_data])
@ -668,7 +724,7 @@ def get_resource_metrics(zkhandler):
    )
    output_lines.append("# TYPE pvc_cluster_memory_provisioned_utilization gauge")
    output_lines.append(
-        f"pvc_cluster_memory_provisioned_utilization {provisioned_memory_percentage:.2f}"
+        f"pvc_cluster_memory_provisioned_utilization {provisioned_memory_percentage:2.2f}"
    )

    output_lines.append(
@ -685,7 +741,7 @@ def get_resource_metrics(zkhandler):
        except Exception:
            continue
    used_disk_percentage = used_disk / total_disk * 100
-    output_lines.append(f"pvc_cluster_disk_utilization {used_disk_percentage:.2f}")
+    output_lines.append(f"pvc_cluster_disk_utilization {used_disk_percentage:2.2f}")

    #
    # Node stats
--- a/daemon-common/node.py
+++ b/daemon-common/node.py
@ -103,6 +103,7 @@ def getNodeInformation(zkhandler, node_name):
        _node_running_domains,
        _node_health,
        _node_health_plugins,
+        _node_network_stats,
    ) = zkhandler.read_many(
        [
            ("node.state.daemon", node_name),
@ -121,6 +122,7 @@ def getNodeInformation(zkhandler, node_name):
            ("node.running_domains", node_name),
            ("node.monitoring.health", node_name),
            ("node.monitoring.plugins", node_name),
+            ("node.network.stats", node_name),
        ]
    )

@ -154,6 +156,8 @@ def getNodeInformation(zkhandler, node_name):
        zkhandler, node_name, node_health_plugins
    )

+    node_network_stats = json.loads(_node_network_stats)
+
    # Construct a data structure to represent the data
    node_information = {
        "name": node_name,
@ -182,6 +186,7 @@ def getNodeInformation(zkhandler, node_name):
            "used": node_mem_used,
            "free": node_mem_free,
        },
+        "interfaces": node_network_stats,
    }
    return node_information