From d2a5fe59c0e023fd937ef22239c61b7797b048e8 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 19 Feb 2020 13:18:38 -0500 Subject: [PATCH] Use transitional takeover states for migration Use a pair of transitional states, "takeover" and "relinquish", when transitioning between primary and secondary coordinator states. This provides a cluster-wide record that the nodes are still working during their synchronous transition states, and should allow clients to determine when the node(s) have fully switched over. Also add an additional 2 seconds of wait at the end of the transition jobs to ensure everything has had a chance to start before proceeding. References #72 --- node-daemon/pvcnoded/Daemon.py | 24 ++++++++++++++---------- node-daemon/pvcnoded/NodeInstance.py | 17 +++++++++++++---- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/node-daemon/pvcnoded/Daemon.py b/node-daemon/pvcnoded/Daemon.py index 7e20c951..7eae1fee 100644 --- a/node-daemon/pvcnoded/Daemon.py +++ b/node-daemon/pvcnoded/Daemon.py @@ -575,14 +575,17 @@ def cleanup(): pass # Force into secondary network state if needed - if zkhandler.readdata(zk_conn, '/nodes/{}/routerstate'.format(myhostname)) == 'primary': - is_primary = True - zkhandler.writedata(zk_conn, { - '/nodes/{}/routerstate'.format(myhostname): 'secondary', - '/primary_node': 'none' - }) - logger.out('Waiting 5 seconds for primary migration', state='s') - time.sleep(5) + try: + if this_node.router_state == 'primary': + is_primary = True + zkhandler.writedata(zk_conn, { + '/primary_node': 'none' + }) + logger.out('Waiting for primary migration', state='s') + while this_node.router_state != 'secondary': + time.sleep(1) + except: + pass # Set stop state in Zookeeper zkhandler.writedata(zk_conn, { '/nodes/{}/daemonstate'.format(myhostname): 'stop' }) @@ -825,9 +828,10 @@ def update_primary(new_primary, stat, event=''): logger.out('Contending for primary coordinator state', state='i') zkhandler.writedata(zk_conn, 
{'/primary_node': myhostname}) elif new_primary == myhostname: - zkhandler.writedata(zk_conn, {'/nodes/{}/routerstate'.format(myhostname): 'primary'}) + zkhandler.writedata(zk_conn, {'/nodes/{}/routerstate'.format(myhostname): 'takeover'}) else: - zkhandler.writedata(zk_conn, {'/nodes/{}/routerstate'.format(myhostname): 'secondary'}) + if this_node.router_state != 'secondary': + zkhandler.writedata(zk_conn, {'/nodes/{}/routerstate'.format(myhostname): 'relinquish'}) else: zkhandler.writedata(zk_conn, {'/nodes/{}/routerstate'.format(myhostname): 'client'}) diff --git a/node-daemon/pvcnoded/NodeInstance.py b/node-daemon/pvcnoded/NodeInstance.py index 9c673f6b..fdbcc7a5 100644 --- a/node-daemon/pvcnoded/NodeInstance.py +++ b/node-daemon/pvcnoded/NodeInstance.py @@ -117,16 +117,19 @@ class NodeInstance(object): if data != self.router_state: self.router_state = data if self.config['enable_networking']: - if self.router_state == 'primary': + if self.router_state == 'takeover': self.logger.out('Setting node {} to primary state'.format(self.name), state='i') transition_thread = threading.Thread(target=self.become_primary, args=(), kwargs={}) transition_thread.start() - else: + if self.router_state == 'relinquish': # Skip becoming secondary unless already running if self.daemon_state == 'run' or self.daemon_state == 'shutdown': self.logger.out('Setting node {} to secondary state'.format(self.name), state='i') transition_thread = threading.Thread(target=self.become_secondary, args=(), kwargs={}) transition_thread.start() + else: + # We did nothing, so just become secondary state + zkhandler.writedata(self.zk_conn, {'/nodes/{}/routerstate'.format(self.name): 'secondary'}) @self.zk_conn.DataWatch('/nodes/{}/domainstate'.format(self.name)) def watch_node_domainstate(data, stat, event=''): @@ -428,8 +431,8 @@ class NodeInstance(object): self.logger.out('Setting Patroni leader to this node', state='i') tick = 1 patroni_failed = True - # As long as we're primary, keep trying to 
set the Patroni leader to us - while self.router_state == 'primary': + # As long as we're in takeover, keep trying to set the Patroni leader to us + while self.router_state == 'takeover': # Switch Patroni leader to the local instance retcode, stdout, stderr = common.run_os_command( """ @@ -489,7 +492,10 @@ class NodeInstance(object): lock.release() self.logger.out('Released write lock for synchronization G', state='o') + # Wait 2 seconds for everything to stabilize before we declare all-done + time.sleep(2) primary_lock.release() + zkhandler.writedata(self.zk_conn, {'/nodes/{}/routerstate'.format(self.name): 'primary'}) self.logger.out('Node {} transitioned to primary state'.format(self.name), state='o') def become_secondary(self): @@ -611,6 +617,9 @@ class NodeInstance(object): lock.release() self.logger.out('Released read lock for synchronization G', state='o') + # Wait 2 seconds for everything to stabilize before we declare all-done + time.sleep(2) + zkhandler.writedata(self.zk_conn, {'/nodes/{}/routerstate'.format(self.name): 'secondary'}) self.logger.out('Node {} transitioned to secondary state'.format(self.name), state='o') # Flush all VMs on the host