Update benchmarks to include resource utilization

Adds additional polled information on node cpu, memory, and network
bandwidth for the node running the test. This should provide additional
useful information about the results of the test.

Also bumps the test format to 2 to ensure clients can handle the changes
properly.
This commit is contained in:
2024-09-18 10:19:12 -04:00
parent ecb812ccac
commit 736762901c
3 changed files with 418 additions and 232 deletions

View File

@ -19,31 +19,34 @@
#
###############################################################################
import os
import psutil
import psycopg2
import psycopg2.extras
import subprocess
from datetime import datetime
from json import loads, dumps
from time import sleep
from daemon_lib.celery import start, fail, log_info, update, finish
import daemon_lib.common as pvc_common
import daemon_lib.ceph as pvc_ceph
# Define the current test format
TEST_FORMAT = 1
TEST_FORMAT = 2
# We run a total of 8 tests, to give a generalized idea of performance on the cluster:
# 1. A sequential read test of 8GB with a 4M block size
# 2. A sequential write test of 8GB with a 4M block size
# 3. A random read test of 8GB with a 4M block size
# 4. A random write test of 8GB with a 4M block size
# 5. A random read test of 8GB with a 256k block size
# 6. A random write test of 8GB with a 256k block size
# 7. A random read test of 8GB with a 4k block size
# 8. A random write test of 8GB with a 4k block size
# 1. A sequential read test of 64GB with a 4M block size
# 2. A sequential write test of 64GB with a 4M block size
# 3. A random read test of 64GB with a 4M block size
# 4. A random write test of 64GB with a 4M block size
# 5. A random read test of 64GB with a 256k block size
# 6. A random write test of 64GB with a 256k block size
# 7. A random read test of 64GB with a 4k block size
# 8. A random write test of 64GB with a 4k block size
# Taken together, these 8 results should give a very good indication of the overall storage performance
# for a variety of workloads.
test_matrix = {
@ -100,7 +103,7 @@ test_matrix = {
# Specify the benchmark volume name and size
benchmark_volume_name = "pvcbenchmark"
benchmark_volume_size = "8G"
benchmark_volume_size = "64G"
#
@ -226,7 +229,7 @@ def cleanup_benchmark_volume(
def run_benchmark_job(
test, pool, job_name=None, db_conn=None, db_cur=None, zkhandler=None
config, test, pool, job_name=None, db_conn=None, db_cur=None, zkhandler=None
):
test_spec = test_matrix[test]
log_info(None, f"Running test '{test}'")
@ -256,24 +259,155 @@ def run_benchmark_job(
)
log_info(None, "Running fio job: {}".format(" ".join(fio_cmd.split())))
retcode, stdout, stderr = pvc_common.run_os_command(fio_cmd)
# Run the fio command manually instead of using our run_os_command wrapper
# This will help us gather statistics about this node while it's running
process = subprocess.Popen(
fio_cmd.split(),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
)
# Wait 15 seconds for the test to start
log_info(None, "Waiting 15 seconds for test resource stabilization")
sleep(15)
# Set up function to get process CPU utilization by name
def get_cpu_utilization_by_name(process_name):
cpu_usage = 0
for proc in psutil.process_iter(["name", "cpu_percent"]):
if proc.info["name"] == process_name:
cpu_usage += proc.info["cpu_percent"]
return cpu_usage
# Set up function to get process memory utilization by name
def get_memory_utilization_by_name(process_name):
memory_usage = 0
for proc in psutil.process_iter(["name", "memory_percent"]):
if proc.info["name"] == process_name:
memory_usage += proc.info["memory_percent"]
return memory_usage
# Set up function to get network traffic utilization in bps
def get_network_traffic_bps(interface, duration=1):
# Get initial network counters
net_io_start = psutil.net_io_counters(pernic=True)
if interface not in net_io_start:
return None, None
stats_start = net_io_start[interface]
bytes_sent_start = stats_start.bytes_sent
bytes_recv_start = stats_start.bytes_recv
# Wait for the specified duration
sleep(duration)
# Get final network counters
net_io_end = psutil.net_io_counters(pernic=True)
stats_end = net_io_end[interface]
bytes_sent_end = stats_end.bytes_sent
bytes_recv_end = stats_end.bytes_recv
# Calculate bytes per second
bytes_sent_per_sec = (bytes_sent_end - bytes_sent_start) / duration
bytes_recv_per_sec = (bytes_recv_end - bytes_recv_start) / duration
# Convert to bits per second (bps)
bits_sent_per_sec = bytes_sent_per_sec * 8
bits_recv_per_sec = bytes_recv_per_sec * 8
bits_total_per_sec = bits_sent_per_sec + bits_recv_per_sec
return bits_sent_per_sec, bits_recv_per_sec, bits_total_per_sec
log_info(None, f"Starting system resource polling for test '{test}'")
storage_interface = config["storage_dev"]
total_cpus = psutil.cpu_count(logical=True)
ticks = 1
osd_cpu_utilization = 0
osd_memory_utilization = 0
mon_cpu_utilization = 0
mon_memory_utilization = 0
total_cpu_utilization = 0
total_memory_utilization = 0
storage_sent_bps = 0
storage_recv_bps = 0
storage_total_bps = 0
while process.poll() is None:
# Do collection of statistics like network bandwidth and cpu utilization
current_osd_cpu_utilization = get_cpu_utilization_by_name("ceph-osd")
current_osd_memory_utilization = get_memory_utilization_by_name("ceph-osd")
current_mon_cpu_utilization = get_cpu_utilization_by_name("ceph-mon")
current_mon_memory_utilization = get_memory_utilization_by_name("ceph-mon")
current_total_cpu_utilization = psutil.cpu_percent(interval=1)
current_total_memory_utilization = psutil.virtual_memory().percent
(
current_storage_sent_bps,
current_storage_recv_bps,
current_storage_total_bps,
) = get_network_traffic_bps(storage_interface)
# Recheck if the process is done yet; if it's not, we add the values and increase the ticks
# This helps ensure that if the process finishes earlier than the longer polls above,
# this particular tick isn't counted which can skew the average
if process.poll() is None:
osd_cpu_utilization += current_osd_cpu_utilization
osd_memory_utilization += current_osd_memory_utilization
mon_cpu_utilization += current_mon_cpu_utilization
mon_memory_utilization += current_mon_memory_utilization
total_cpu_utilization += current_total_cpu_utilization
total_memory_utilization += current_total_memory_utilization
storage_sent_bps += current_storage_sent_bps
storage_recv_bps += current_storage_recv_bps
storage_total_bps += current_storage_total_bps
ticks += 1
# Get the 1-minute load average and CPU utilization, which covers the test duration
load1, _, _ = os.getloadavg()
load1 = round(load1, 2)
# Calculate the average CPU utilization values over the runtime
# Divide the OSD and MON CPU utilization by the total number of CPU cores, because
# the total is divided this way
avg_osd_cpu_utilization = round(osd_cpu_utilization / ticks / total_cpus, 2)
avg_osd_memory_utilization = round(osd_memory_utilization / ticks, 2)
avg_mon_cpu_utilization = round(mon_cpu_utilization / ticks / total_cpus, 2)
avg_mon_memory_utilization = round(mon_memory_utilization / ticks, 2)
avg_total_cpu_utilization = round(total_cpu_utilization / ticks, 2)
avg_total_memory_utilization = round(total_memory_utilization / ticks, 2)
avg_storage_sent_bps = round(storage_sent_bps / ticks, 2)
avg_storage_recv_bps = round(storage_recv_bps / ticks, 2)
avg_storage_total_bps = round(storage_total_bps / ticks, 2)
stdout, stderr = process.communicate()
retcode = process.returncode
resource_data = {
"avg_cpu_util_percent": {
"total": avg_total_cpu_utilization,
"ceph-mon": avg_mon_cpu_utilization,
"ceph-osd": avg_osd_cpu_utilization,
},
"avg_memory_util_percent": {
"total": avg_total_memory_utilization,
"ceph-mon": avg_mon_memory_utilization,
"ceph-osd": avg_osd_memory_utilization,
},
"avg_network_util_bps": {
"sent": avg_storage_sent_bps,
"recv": avg_storage_recv_bps,
"total": avg_storage_total_bps,
},
}
try:
jstdout = loads(stdout)
if retcode:
raise
except Exception:
cleanup(
job_name,
db_conn=db_conn,
db_cur=db_cur,
zkhandler=zkhandler,
)
fail(
None,
f"Failed to run fio test '{test}': {stderr}",
)
return None, None
return jstdout
return resource_data, jstdout
def worker_run_benchmark(zkhandler, celery, config, pool):
@ -358,7 +492,8 @@ def worker_run_benchmark(zkhandler, celery, config, pool):
total=total_stages,
)
results[test] = run_benchmark_job(
resource_data, fio_data = run_benchmark_job(
config,
test,
pool,
job_name=job_name,
@ -366,6 +501,25 @@ def worker_run_benchmark(zkhandler, celery, config, pool):
db_cur=db_cur,
zkhandler=zkhandler,
)
if resource_data is None or fio_data is None:
cleanup_benchmark_volume(
pool,
job_name=job_name,
db_conn=db_conn,
db_cur=db_cur,
zkhandler=zkhandler,
)
cleanup(
job_name,
db_conn=db_conn,
db_cur=db_cur,
zkhandler=zkhandler,
)
fail(
None,
f"Failed to run fio test '{test}'",
)
results[test] = {**resource_data, **fio_data}
# Phase 3 - cleanup
current_stage += 1