Improve fence handling to prevent anomalies

1. Move fence monitoring to its own thread rather than doing the listing and triggering within the main keepalive thread.
2. Add a global lock key at /config/fence_lock and use it to prevent multiple nodes from trying to run fences simultaneously.
3. Run the fencing monitor for each node sequentially within the main fence monitoring thread, to ensure that fences of multiple nodes happen sequentially rather than in parallel.

All of these should help prevent anomalies where one node tries to fence multiple nodes at once without recourse.
2024-10-10 16:38:19 -04:00
parent ebec1332e9
commit a6f8500309
4 changed files with 71 additions and 41 deletions
--- a/daemon-common/zkhandler.py
+++ b/daemon-common/zkhandler.py
@@ -576,7 +576,7 @@ class ZKHandler(object):
 #
 class ZKSchema(object):
    # Current version
-    _version = 14
+    _version = 15

    # Root for doing nested keys
    _schema_root = ""
@@ -592,6 +592,7 @@ class ZKSchema(object):
            "schema.version": f"{_schema_root}/schema/version",
            "config": f"{_schema_root}/config",
            "config.maintenance": f"{_schema_root}/config/maintenance",
+            "config.fence_lock": f"{_schema_root}/config/fence_lock",
            "config.primary_node": f"{_schema_root}/config/primary_node",
            "config.primary_node.sync_lock": f"{_schema_root}/config/primary_node/sync_lock",
            "config.upstream_ip": f"{_schema_root}/config/upstream_ip",