Add node network statistics and utilization values

Adds a new physical network interface stats parser to the node
keepalives, and leverages this information to provide a network
utilization overview in the Prometheus metrics.
This commit is contained in:
2023-12-21 15:12:20 -05:00
parent d2d2a9c617
commit 3e4cc53fdd
6 changed files with 386 additions and 13 deletions

View File

@ -619,6 +619,54 @@ def get_resource_metrics(zkhandler):
output_lines = list()
#
# Network Utilization stats
#
# This is a bit of a doozie. First, for each node, we have to determine the % utilization
# of all the (active) network interface on that node, averaged together. Then we average
# the values of all the nodes together.
# This is very rough, but should give some idea as to the total network bandwidth used
# and available.
all_total_speed = 0
all_total_util = 0
all_total_count = 0
for node in node_data:
if node["daemon_state"] != "run":
continue
total_speed = 0
total_util = 0
total_count = 0
for iface in node["interfaces"].keys():
link_state = node["interfaces"][iface]["state"]
if link_state != "up":
continue
link_speed = node["interfaces"][iface]["link_speed"] * 2 # full-duplex
total_speed += link_speed
total_bps = node["interfaces"][iface]["total_bps"]
total_util += total_bps
total_count += 1
if total_count > 0:
# Average the speed and util by the count
avg_speed = int(total_speed / total_count)
all_total_speed += avg_speed
avg_util = int(total_util / total_count)
all_total_util += avg_util
all_total_count += 1
if all_total_count > 0:
all_avg_speed = all_total_speed / all_total_count
all_avg_util = all_total_util / all_total_count
used_network_percentage = all_avg_util / all_avg_speed * 100
else:
used_network_percentage = 0
#
# Cluster stats
#
@ -633,7 +681,15 @@ def get_resource_metrics(zkhandler):
total_cpu = sum(node_sorted_cpu[:-2])
used_cpu = sum([n["load"] for n in node_data])
used_cpu_percentage = used_cpu / total_cpu * 100
output_lines.append(f"pvc_cluster_cpu_utilization {used_cpu_percentage:.2f}")
output_lines.append(f"pvc_cluster_cpu_utilization {used_cpu_percentage:2.2f}")
output_lines.append(
"# HELP pvc_cluster_network_utilization PVC cluster network utilization percentage"
)
output_lines.append("# TYPE pvc_cluster_network_utilization gauge")
output_lines.append(
f"pvc_cluster_network_utilization {used_network_percentage:2.2f}"
)
node_sorted_memory = [
n["memory"]["total"]
@ -648,7 +704,7 @@ def get_resource_metrics(zkhandler):
)
output_lines.append("# TYPE pvc_cluster_memory_real_utilization gauge")
output_lines.append(
f"pvc_cluster_memory_real_utilization {used_memory_percentage:.2f}"
f"pvc_cluster_memory_real_utilization {used_memory_percentage:2.2f}"
)
allocated_memory = sum([n["memory"]["allocated"] for n in node_data])
@ -658,7 +714,7 @@ def get_resource_metrics(zkhandler):
)
output_lines.append("# TYPE pvc_cluster_memory_allocated_utilization gauge")
output_lines.append(
f"pvc_cluster_memory_allocated_utilization {allocated_memory_percentage:.2f}"
f"pvc_cluster_memory_allocated_utilization {allocated_memory_percentage:2.2f}"
)
provisioned_memory = sum([n["memory"]["provisioned"] for n in node_data])
@ -668,7 +724,7 @@ def get_resource_metrics(zkhandler):
)
output_lines.append("# TYPE pvc_cluster_memory_provisioned_utilization gauge")
output_lines.append(
f"pvc_cluster_memory_provisioned_utilization {provisioned_memory_percentage:.2f}"
f"pvc_cluster_memory_provisioned_utilization {provisioned_memory_percentage:2.2f}"
)
output_lines.append(
@ -685,7 +741,7 @@ def get_resource_metrics(zkhandler):
except Exception:
continue
used_disk_percentage = used_disk / total_disk * 100
output_lines.append(f"pvc_cluster_disk_utilization {used_disk_percentage:.2f}")
output_lines.append(f"pvc_cluster_disk_utilization {used_disk_percentage:2.2f}")
#
# Node stats

View File

@ -103,6 +103,7 @@ def getNodeInformation(zkhandler, node_name):
_node_running_domains,
_node_health,
_node_health_plugins,
_node_network_stats,
) = zkhandler.read_many(
[
("node.state.daemon", node_name),
@ -121,6 +122,7 @@ def getNodeInformation(zkhandler, node_name):
("node.running_domains", node_name),
("node.monitoring.health", node_name),
("node.monitoring.plugins", node_name),
("node.network.stats", node_name),
]
)
@ -154,6 +156,8 @@ def getNodeInformation(zkhandler, node_name):
zkhandler, node_name, node_health_plugins
)
node_network_stats = json.loads(_node_network_stats)
# Construct a data structure to represent the data
node_information = {
"name": node_name,
@ -182,6 +186,7 @@ def getNodeInformation(zkhandler, node_name):
"used": node_mem_used,
"free": node_mem_free,
},
"interfaces": node_network_stats,
}
return node_information