Port all Celery worker functions to discrete pkg
Moves all tasks run by the Celery worker into a discrete package/module for easier installation. Also adjusts several parameters throughout to accomplish this.
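For example, code that previously lived beside the API can now import the benchmark task directly from the shared library (a sketch; this assumes the daemon-common tree is installed as the daemon_lib package, as the imports in this file suggest):

    from daemon_lib.benchmark import worker_run_benchmark, list_benchmarks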
daemon-common/benchmark.py (new file, 421 lines)
@@ -0,0 +1,421 @@
#!/usr/bin/env python3

# benchmark.py - PVC API Benchmark functions
# Part of the Parallel Virtual Cluster (PVC) system
#
#    Copyright (C) 2018-2022 Joshua M. Boniface <joshua@boniface.me>
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, version 3.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

import psycopg2
import psycopg2.extras

from datetime import datetime
from json import loads, dumps

from daemon_lib.celery import start, fail, log_info, update, finish

import daemon_lib.common as pvc_common
import daemon_lib.ceph as pvc_ceph


# Define the current test format
TEST_FORMAT = 1

# We run a total of 8 tests, to give a generalized idea of performance on the cluster:
#   1. A sequential read test of 8GB with a 4M block size
#   2. A sequential write test of 8GB with a 4M block size
#   3. A random read test of 8GB with a 4M block size
#   4. A random write test of 8GB with a 4M block size
#   5. A random read test of 8GB with a 4K block size
#   6. A random write test of 8GB with a 4K block size
#   7. A random read test of 8GB with a 4K block size and an I/O depth of 1
#   8. A random write test of 8GB with a 4K block size and an I/O depth of 1
# Taken together, these 8 results should give a very good indication of the overall storage performance
# for a variety of workloads.
test_matrix = {
    "seq_read": {
        "direction": "read",
        "iodepth": "64",
        "bs": "4M",
        "rw": "read",
    },
    "seq_write": {
        "direction": "write",
        "iodepth": "64",
        "bs": "4M",
        "rw": "write",
    },
    "rand_read_4M": {
        "direction": "read",
        "iodepth": "64",
        "bs": "4M",
        "rw": "randread",
    },
    "rand_write_4M": {
        "direction": "write",
        "iodepth": "64",
        "bs": "4M",
        "rw": "randwrite",
    },
    "rand_read_4K": {
        "direction": "read",
        "iodepth": "64",
        "bs": "4K",
        "rw": "randread",
    },
    "rand_write_4K": {
        "direction": "write",
        "iodepth": "64",
        "bs": "4K",
        "rw": "randwrite",
    },
    "rand_read_4K_lowdepth": {
        "direction": "read",
        "iodepth": "1",
        "bs": "4K",
        "rw": "randread",
    },
    "rand_write_4K_lowdepth": {
        "direction": "write",
        "iodepth": "1",
        "bs": "4K",
        "rw": "randwrite",
    },
}

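# Each entry above maps directly onto fio arguments in run_benchmark_job()
# below; for example, "seq_read" expands to roughly:
#   fio --readwrite=read --bs=4M --iodepth=64 ... (against the benchmark volume)
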
# Specify the benchmark volume name and size
benchmark_volume_name = "pvcbenchmark"
benchmark_volume_size = "8G"


#
# Exceptions (used by Celery tasks)
#
class BenchmarkError(Exception):
    pass


#
# Common functions
#


def cleanup(job_name, db_conn=None, db_cur=None, zkhandler=None):
    if db_conn is not None and db_cur is not None:
        # Clean up our dangling result
        query = "DELETE FROM storage_benchmarks WHERE job = %s;"
        args = (job_name,)
        db_cur.execute(query, args)
        db_conn.commit()
        # Close the database connections cleanly
        close_database(db_conn, db_cur)
    if zkhandler is not None:
        zkhandler.disconnect()
        del zkhandler


# Database connections
def open_database(config):
    conn = psycopg2.connect(
        host=config["api_postgresql_host"],
        port=config["api_postgresql_port"],
        dbname=config["api_postgresql_dbname"],
        user=config["api_postgresql_user"],
        password=config["api_postgresql_password"],
    )
    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    return conn, cur


def close_database(conn, cur, failed=False):
    if not failed:
        conn.commit()
    cur.close()
    conn.close()


def list_benchmarks(config, job=None):
    if job is not None:
        query = "SELECT * FROM storage_benchmarks WHERE job = %s;"
        args = (job,)
    else:
        query = "SELECT * FROM storage_benchmarks ORDER BY id DESC;"
        args = ()

    conn, cur = open_database(config)
    cur.execute(query, args)
    orig_data = cur.fetchall()
    data = list()
    for benchmark in orig_data:
        benchmark_data = dict()
        benchmark_data["id"] = benchmark["id"]
        benchmark_data["job"] = benchmark["job"]
        benchmark_data["test_format"] = benchmark["test_format"]
        if benchmark["result"] == "Running":
            benchmark_data["benchmark_result"] = "Running"
        else:
            try:
                benchmark_data["benchmark_result"] = loads(benchmark["result"])
            except Exception:
                benchmark_data["benchmark_result"] = {}
        # Append the new data to our actual output structure
        data.append(benchmark_data)
    close_database(conn, cur)
    if data:
        return data, 200
    else:
        return {"message": "No benchmark found."}, 404
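
# Example (hypothetical) call to list_benchmarks() from an API handler; the
# job name format is "<ISO timestamp>_<primary node>" as built further below:
#   data, retcode = list_benchmarks(config, job="2022-01-01T00:00:00_hv1")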


def prepare_benchmark_volume(
    pool, job_name=None, db_conn=None, db_cur=None, zkhandler=None
):
    # Create the RBD volume
    retcode, retmsg = pvc_ceph.add_volume(
        zkhandler, pool, benchmark_volume_name, benchmark_volume_size
    )
    if not retcode:
        cleanup(
            job_name,
            db_conn=db_conn,
            db_cur=db_cur,
            zkhandler=zkhandler,
        )
        fail(
            None,
            f'Failed to create volume "{benchmark_volume_name}" on pool "{pool}": {retmsg}',
        )
    else:
        log_info(None, retmsg)

def cleanup_benchmark_volume(
    pool, job_name=None, db_conn=None, db_cur=None, zkhandler=None
):
    # Remove the RBD volume
    retcode, retmsg = pvc_ceph.remove_volume(zkhandler, pool, benchmark_volume_name)
    if not retcode:
        cleanup(
            job_name,
            db_conn=db_conn,
            db_cur=db_cur,
            zkhandler=zkhandler,
        )
        fail(
            None,
            f'Failed to remove volume "{benchmark_volume_name}" from pool "{pool}": {retmsg}',
        )
    else:
        log_info(None, retmsg)


def run_benchmark_job(
    test, pool, job_name=None, db_conn=None, db_cur=None, zkhandler=None
):
    test_spec = test_matrix[test]
    log_info(None, f"Running test '{test}'")
    fio_cmd = """
        fio \
            --name={test} \
            --ioengine=rbd \
            --pool={pool} \
            --rbdname={volume} \
            --output-format=json \
            --direct=1 \
            --randrepeat=1 \
            --numjobs=1 \
            --time_based \
            --runtime=75 \
            --group_reporting \
            --iodepth={iodepth} \
            --bs={bs} \
            --readwrite={rw}
    """.format(
        test=test,
        pool=pool,
        volume=benchmark_volume_name,
        iodepth=test_spec["iodepth"],
        bs=test_spec["bs"],
        rw=test_spec["rw"],
    )

    log_info(None, "Running fio job: {}".format(" ".join(fio_cmd.split())))
    retcode, stdout, stderr = pvc_common.run_os_command(fio_cmd)
    try:
        jstdout = loads(stdout)
        if retcode:
            # A non-zero return code means the fio run failed even if its
            # output parsed as valid JSON; treat it as a failure too
            raise BenchmarkError(f"fio exited with code {retcode}")
    except Exception:
        cleanup(
            job_name,
            db_conn=db_conn,
            db_cur=db_cur,
            zkhandler=zkhandler,
        )
        fail(
            None,
            f"Failed to run fio test '{test}': {stderr}",
        )

    return jstdout
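
# run_benchmark_job() returns fio's full JSON output document, which
# worker_run_benchmark() below stores per-test; a consumer could pull summary
# numbers from it, e.g. (the exact key layout depends on the fio version):
#   read_bw_kib = results["seq_read"]["jobs"][0]["read"]["bw"]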


def worker_run_benchmark(zkhandler, celery, config, pool):
    # Phase 0 - connect to databases
    cur_time = datetime.now().isoformat(timespec="seconds")
    cur_primary = zkhandler.read("base.config.primary_node")
    job_name = f"{cur_time}_{cur_primary}"

    current_stage = 0
    total_stages = 13
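    # 13 stages: status insert (1) + volume create (1) + the 8 tests +
    # volume cleanup (1) + result store (1) + the final finish() (1)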
    start(
        celery,
        f"Running storage benchmark '{job_name}' on pool '{pool}'",
        current=current_stage,
        total=total_stages,
    )

    try:
        db_conn, db_cur = open_database(config)
    except Exception:
        cleanup(
            job_name,
            db_conn=None,
            db_cur=None,
            zkhandler=zkhandler,
        )
        fail(
            celery,
            "Failed to connect to Postgres",
        )

    current_stage += 1
    update(
        celery,
        "Storing running status in database",
        current=current_stage,
        total=total_stages,
    )

    try:
        query = "INSERT INTO storage_benchmarks (job, test_format, result) VALUES (%s, %s, %s);"
        args = (
            job_name,
            TEST_FORMAT,
            "Running",
        )
        db_cur.execute(query, args)
        db_conn.commit()
    except Exception as e:
        cleanup(
            job_name,
            db_conn=db_conn,
            db_cur=db_cur,
            zkhandler=zkhandler,
        )
        fail(celery, f"Failed to store running status: {e}", exception=BenchmarkError)

    # Phase 1 - benchmark volume preparation
    current_stage += 1
    update(
        celery,
        "Creating benchmark volume",
        current=current_stage,
        total=total_stages,
    )

    prepare_benchmark_volume(
        pool,
        job_name=job_name,
        db_conn=db_conn,
        db_cur=db_cur,
        zkhandler=zkhandler,
    )

    # Phase 2 - benchmark run
    results = dict()
    for test in test_matrix:
        current_stage += 1
        update(
            celery,
            f"Running benchmark job '{test}'",
            current=current_stage,
            total=total_stages,
        )

        results[test] = run_benchmark_job(
            test,
            pool,
            job_name=job_name,
            db_conn=db_conn,
            db_cur=db_cur,
            zkhandler=zkhandler,
        )

    # Phase 3 - cleanup
    current_stage += 1
    update(
        celery,
        "Cleaning up benchmark volume",
        current=current_stage,
        total=total_stages,
    )

    cleanup_benchmark_volume(
        pool,
        job_name=job_name,
        db_conn=db_conn,
        db_cur=db_cur,
        zkhandler=zkhandler,
    )

    current_stage += 1
    update(
        celery,
        "Storing results in database",
        current=current_stage,
        total=total_stages,
    )

    try:
        query = "UPDATE storage_benchmarks SET result = %s WHERE job = %s;"
        args = (dumps(results), job_name)
        db_cur.execute(query, args)
        db_conn.commit()
    except Exception as e:
        cleanup(
            job_name,
            db_conn=db_conn,
            db_cur=db_cur,
            zkhandler=zkhandler,
        )
        fail(celery, f"Failed to store test results: {e}", exception=BenchmarkError)

    cleanup(
        job_name,
        db_conn=db_conn,
        db_cur=db_cur,
        zkhandler=zkhandler,
    )

    current_stage += 1
    return finish(
        celery,
        f"Storage benchmark {job_name} completed successfully",
        current=current_stage,
        total=total_stages,
    )
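
A hypothetical Celery-side wrapper for the relocated task might look like the
sketch below (the task name, `storage_benchmark`, and the surrounding wiring
are assumptions; the actual task registration lives in the worker daemon,
which supplies its own zkhandler and config):

    from daemon_lib.benchmark import worker_run_benchmark

    @celery.task(name="storage.benchmark", bind=True)
    def storage_benchmark(self, pool=None):
        # The bound task instance (self) is passed through so the helpers
        # can emit start/update/finish state back to Celery
        return worker_run_benchmark(zkhandler, self, config, pool)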