Reformat code with Black code formatter

Unify the code style along PEP and Black principles using the tool.
This commit is contained in:
2021-11-06 03:02:43 -04:00
parent 3779bc960e
commit c41664d2da
47 changed files with 15547 additions and 10151 deletions

View File

@ -36,8 +36,9 @@ class MalformedConfigurationError(Exception):
"""
An except when parsing the PVC Node daemon configuration file
"""
def __init__(self, error=None):
self.msg = f'ERROR: Configuration file is malformed: {error}'
self.msg = f"ERROR: Configuration file is malformed: {error}"
def __str__(self):
return str(self.msg)
@ -50,19 +51,19 @@ def get_static_data():
staticdata = list()
staticdata.append(str(cpu_count())) # CPU count
staticdata.append(
subprocess.run(
['uname', '-r'], stdout=subprocess.PIPE
).stdout.decode('ascii').strip()
subprocess.run(["uname", "-r"], stdout=subprocess.PIPE)
.stdout.decode("ascii")
.strip()
)
staticdata.append(
subprocess.run(
['uname', '-o'], stdout=subprocess.PIPE
).stdout.decode('ascii').strip()
subprocess.run(["uname", "-o"], stdout=subprocess.PIPE)
.stdout.decode("ascii")
.strip()
)
staticdata.append(
subprocess.run(
['uname', '-m'], stdout=subprocess.PIPE
).stdout.decode('ascii').strip()
subprocess.run(["uname", "-m"], stdout=subprocess.PIPE)
.stdout.decode("ascii")
.strip()
)
return staticdata
@ -70,7 +71,7 @@ def get_static_data():
def get_configuration_path():
try:
return os.environ['PVCD_CONFIG_FILE']
return os.environ["PVCD_CONFIG_FILE"]
except KeyError:
print('ERROR: The "PVCD_CONFIG_FILE" environment variable must be set.')
os._exit(1)
@ -78,10 +79,10 @@ def get_configuration_path():
def get_hostname():
node_fqdn = gethostname()
node_hostname = node_fqdn.split('.', 1)[0]
node_domain = ''.join(node_fqdn.split('.', 1)[1:])
node_hostname = node_fqdn.split(".", 1)[0]
node_domain = "".join(node_fqdn.split(".", 1)[1:])
try:
node_id = findall(r'\d+', node_hostname)[-1]
node_id = findall(r"\d+", node_hostname)[-1]
except IndexError:
node_id = 0
@ -89,27 +90,33 @@ def get_hostname():
def validate_floating_ip(config, network):
if network not in ['cluster', 'storage', 'upstream']:
if network not in ["cluster", "storage", "upstream"]:
return False, f'Specified network type "{network}" is not valid'
floating_key = f'{network}_floating_ip'
network_key = f'{network}_network'
floating_key = f"{network}_floating_ip"
network_key = f"{network}_network"
# Verify the network provided is valid
try:
network = ip_network(config[network_key])
except Exception:
return False, f'Network address {config[network_key]} for {network_key} is not valid'
return (
False,
f"Network address {config[network_key]} for {network_key} is not valid",
)
# Verify that the floating IP is valid (and in the network)
try:
floating_address = ip_address(config[floating_key].split('/')[0])
floating_address = ip_address(config[floating_key].split("/")[0])
if floating_address not in list(network.hosts()):
raise
except Exception:
return False, f'Floating address {config[floating_key]} for {floating_key} is not valid'
return (
False,
f"Floating address {config[floating_key]} for {floating_key} is not valid",
)
return True, ''
return True, ""
def get_configuration():
@ -120,11 +127,11 @@ def get_configuration():
print('Loading configuration from file "{}"'.format(pvcnoded_config_file))
with open(pvcnoded_config_file, 'r') as cfgfile:
with open(pvcnoded_config_file, "r") as cfgfile:
try:
o_config = yaml.load(cfgfile, Loader=yaml.SafeLoader)
except Exception as e:
print('ERROR: Failed to parse configuration file: {}'.format(e))
print("ERROR: Failed to parse configuration file: {}".format(e))
os._exit(1)
node_fqdn, node_hostname, node_domain, node_id = get_hostname()
@ -134,263 +141,287 @@ def get_configuration():
# Get the initial base configuration
try:
o_base = o_config['pvc']
o_cluster = o_config['pvc']['cluster']
o_base = o_config["pvc"]
o_cluster = o_config["pvc"]["cluster"]
except Exception as e:
raise MalformedConfigurationError(e)
config_general = {
'node': o_base.get('node', node_hostname),
'node_hostname': node_hostname,
'node_fqdn': node_fqdn,
'node_domain': node_domain,
'node_id': node_id,
'coordinators': o_cluster.get('coordinators', list()),
'debug': o_base.get('debug', False),
"node": o_base.get("node", node_hostname),
"node_hostname": node_hostname,
"node_fqdn": node_fqdn,
"node_domain": node_domain,
"node_id": node_id,
"coordinators": o_cluster.get("coordinators", list()),
"debug": o_base.get("debug", False),
}
config = {**config, **config_general}
# Get the functions configuration
try:
o_functions = o_config['pvc']['functions']
o_functions = o_config["pvc"]["functions"]
except Exception as e:
raise MalformedConfigurationError(e)
config_functions = {
'enable_hypervisor': o_functions.get('enable_hypervisor', False),
'enable_networking': o_functions.get('enable_networking', False),
'enable_storage': o_functions.get('enable_storage', False),
'enable_api': o_functions.get('enable_api', False),
"enable_hypervisor": o_functions.get("enable_hypervisor", False),
"enable_networking": o_functions.get("enable_networking", False),
"enable_storage": o_functions.get("enable_storage", False),
"enable_api": o_functions.get("enable_api", False),
}
config = {**config, **config_functions}
# Get the directory configuration
try:
o_directories = o_config['pvc']['system']['configuration']['directories']
o_directories = o_config["pvc"]["system"]["configuration"]["directories"]
except Exception as e:
raise MalformedConfigurationError(e)
config_directories = {
'dynamic_directory': o_directories.get('dynamic_directory', None),
'log_directory': o_directories.get('log_directory', None),
'console_log_directory': o_directories.get('console_log_directory', None),
"dynamic_directory": o_directories.get("dynamic_directory", None),
"log_directory": o_directories.get("log_directory", None),
"console_log_directory": o_directories.get("console_log_directory", None),
}
# Define our dynamic directory schema
config_directories['dnsmasq_dynamic_directory'] = config_directories['dynamic_directory'] + '/dnsmasq'
config_directories['pdns_dynamic_directory'] = config_directories['dynamic_directory'] + '/pdns'
config_directories['nft_dynamic_directory'] = config_directories['dynamic_directory'] + '/nft'
config_directories["dnsmasq_dynamic_directory"] = (
config_directories["dynamic_directory"] + "/dnsmasq"
)
config_directories["pdns_dynamic_directory"] = (
config_directories["dynamic_directory"] + "/pdns"
)
config_directories["nft_dynamic_directory"] = (
config_directories["dynamic_directory"] + "/nft"
)
# Define our log directory schema
config_directories['dnsmasq_log_directory'] = config_directories['log_directory'] + '/dnsmasq'
config_directories['pdns_log_directory'] = config_directories['log_directory'] + '/pdns'
config_directories['nft_log_directory'] = config_directories['log_directory'] + '/nft'
config_directories["dnsmasq_log_directory"] = (
config_directories["log_directory"] + "/dnsmasq"
)
config_directories["pdns_log_directory"] = (
config_directories["log_directory"] + "/pdns"
)
config_directories["nft_log_directory"] = (
config_directories["log_directory"] + "/nft"
)
config = {**config, **config_directories}
# Get the logging configuration
try:
o_logging = o_config['pvc']['system']['configuration']['logging']
o_logging = o_config["pvc"]["system"]["configuration"]["logging"]
except Exception as e:
raise MalformedConfigurationError(e)
config_logging = {
'file_logging': o_logging.get('file_logging', False),
'stdout_logging': o_logging.get('stdout_logging', False),
'zookeeper_logging': o_logging.get('zookeeper_logging', False),
'log_colours': o_logging.get('log_colours', False),
'log_dates': o_logging.get('log_dates', False),
'log_keepalives': o_logging.get('log_keepalives', False),
'log_keepalive_cluster_details': o_logging.get('log_keepalive_cluster_details', False),
'log_keepalive_storage_details': o_logging.get('log_keepalive_storage_details', False),
'console_log_lines': o_logging.get('console_log_lines', False),
'node_log_lines': o_logging.get('node_log_lines', False),
"file_logging": o_logging.get("file_logging", False),
"stdout_logging": o_logging.get("stdout_logging", False),
"zookeeper_logging": o_logging.get("zookeeper_logging", False),
"log_colours": o_logging.get("log_colours", False),
"log_dates": o_logging.get("log_dates", False),
"log_keepalives": o_logging.get("log_keepalives", False),
"log_keepalive_cluster_details": o_logging.get(
"log_keepalive_cluster_details", False
),
"log_keepalive_storage_details": o_logging.get(
"log_keepalive_storage_details", False
),
"console_log_lines": o_logging.get("console_log_lines", False),
"node_log_lines": o_logging.get("node_log_lines", False),
}
config = {**config, **config_logging}
# Get the interval configuration
try:
o_intervals = o_config['pvc']['system']['intervals']
o_intervals = o_config["pvc"]["system"]["intervals"]
except Exception as e:
raise MalformedConfigurationError(e)
config_intervals = {
'vm_shutdown_timeout': int(o_intervals.get('vm_shutdown_timeout', 60)),
'keepalive_interval': int(o_intervals.get('keepalive_interval', 5)),
'fence_intervals': int(o_intervals.get('fence_intervals', 6)),
'suicide_intervals': int(o_intervals.get('suicide_interval', 0)),
"vm_shutdown_timeout": int(o_intervals.get("vm_shutdown_timeout", 60)),
"keepalive_interval": int(o_intervals.get("keepalive_interval", 5)),
"fence_intervals": int(o_intervals.get("fence_intervals", 6)),
"suicide_intervals": int(o_intervals.get("suicide_interval", 0)),
}
config = {**config, **config_intervals}
# Get the fencing configuration
try:
o_fencing = o_config['pvc']['system']['fencing']
o_fencing_actions = o_fencing['actions']
o_fencing_ipmi = o_fencing['ipmi']
o_fencing = o_config["pvc"]["system"]["fencing"]
o_fencing_actions = o_fencing["actions"]
o_fencing_ipmi = o_fencing["ipmi"]
except Exception as e:
raise MalformedConfigurationError(e)
config_fencing = {
'successful_fence': o_fencing_actions.get('successful_fence', None),
'failed_fence': o_fencing_actions.get('failed_fence', None),
'ipmi_hostname': o_fencing_ipmi.get('host', f'{node_hostname}-lom.{node_domain}'),
'ipmi_username': o_fencing_ipmi.get('user', 'null'),
'ipmi_password': o_fencing_ipmi.get('pass', 'null'),
"successful_fence": o_fencing_actions.get("successful_fence", None),
"failed_fence": o_fencing_actions.get("failed_fence", None),
"ipmi_hostname": o_fencing_ipmi.get(
"host", f"{node_hostname}-lom.{node_domain}"
),
"ipmi_username": o_fencing_ipmi.get("user", "null"),
"ipmi_password": o_fencing_ipmi.get("pass", "null"),
}
config = {**config, **config_fencing}
# Get the migration configuration
try:
o_migration = o_config['pvc']['system']['migration']
o_migration = o_config["pvc"]["system"]["migration"]
except Exception as e:
raise MalformedConfigurationError(e)
config_migration = {
'migration_target_selector': o_migration.get('target_selector', 'mem'),
"migration_target_selector": o_migration.get("target_selector", "mem"),
}
config = {**config, **config_migration}
if config['enable_networking']:
if config["enable_networking"]:
# Get the node networks configuration
try:
o_networks = o_config['pvc']['cluster']['networks']
o_network_cluster = o_networks['cluster']
o_network_storage = o_networks['storage']
o_network_upstream = o_networks['upstream']
o_sysnetworks = o_config['pvc']['system']['configuration']['networking']
o_sysnetwork_cluster = o_sysnetworks['cluster']
o_sysnetwork_storage = o_sysnetworks['storage']
o_sysnetwork_upstream = o_sysnetworks['upstream']
o_networks = o_config["pvc"]["cluster"]["networks"]
o_network_cluster = o_networks["cluster"]
o_network_storage = o_networks["storage"]
o_network_upstream = o_networks["upstream"]
o_sysnetworks = o_config["pvc"]["system"]["configuration"]["networking"]
o_sysnetwork_cluster = o_sysnetworks["cluster"]
o_sysnetwork_storage = o_sysnetworks["storage"]
o_sysnetwork_upstream = o_sysnetworks["upstream"]
except Exception as e:
raise MalformedConfigurationError(e)
config_networks = {
'cluster_domain': o_network_cluster.get('domain', None),
'cluster_network': o_network_cluster.get('network', None),
'cluster_floating_ip': o_network_cluster.get('floating_ip', None),
'cluster_dev': o_sysnetwork_cluster.get('device', None),
'cluster_mtu': o_sysnetwork_cluster.get('mtu', None),
'cluster_dev_ip': o_sysnetwork_cluster.get('address', None),
'storage_domain': o_network_storage.get('domain', None),
'storage_network': o_network_storage.get('network', None),
'storage_floating_ip': o_network_storage.get('floating_ip', None),
'storage_dev': o_sysnetwork_storage.get('device', None),
'storage_mtu': o_sysnetwork_storage.get('mtu', None),
'storage_dev_ip': o_sysnetwork_storage.get('address', None),
'upstream_domain': o_network_upstream.get('domain', None),
'upstream_network': o_network_upstream.get('network', None),
'upstream_floating_ip': o_network_upstream.get('floating_ip', None),
'upstream_gateway': o_network_upstream.get('gateway', None),
'upstream_dev': o_sysnetwork_upstream.get('device', None),
'upstream_mtu': o_sysnetwork_upstream.get('mtu', None),
'upstream_dev_ip': o_sysnetwork_upstream.get('address', None),
'bridge_dev': o_sysnetworks.get('bridge_device', None),
'bridge_mtu': o_sysnetworks.get('bridge_mtu', None),
'enable_sriov': o_sysnetworks.get('sriov_enable', False),
'sriov_device': o_sysnetworks.get('sriov_device', list())
"cluster_domain": o_network_cluster.get("domain", None),
"cluster_network": o_network_cluster.get("network", None),
"cluster_floating_ip": o_network_cluster.get("floating_ip", None),
"cluster_dev": o_sysnetwork_cluster.get("device", None),
"cluster_mtu": o_sysnetwork_cluster.get("mtu", None),
"cluster_dev_ip": o_sysnetwork_cluster.get("address", None),
"storage_domain": o_network_storage.get("domain", None),
"storage_network": o_network_storage.get("network", None),
"storage_floating_ip": o_network_storage.get("floating_ip", None),
"storage_dev": o_sysnetwork_storage.get("device", None),
"storage_mtu": o_sysnetwork_storage.get("mtu", None),
"storage_dev_ip": o_sysnetwork_storage.get("address", None),
"upstream_domain": o_network_upstream.get("domain", None),
"upstream_network": o_network_upstream.get("network", None),
"upstream_floating_ip": o_network_upstream.get("floating_ip", None),
"upstream_gateway": o_network_upstream.get("gateway", None),
"upstream_dev": o_sysnetwork_upstream.get("device", None),
"upstream_mtu": o_sysnetwork_upstream.get("mtu", None),
"upstream_dev_ip": o_sysnetwork_upstream.get("address", None),
"bridge_dev": o_sysnetworks.get("bridge_device", None),
"bridge_mtu": o_sysnetworks.get("bridge_mtu", None),
"enable_sriov": o_sysnetworks.get("sriov_enable", False),
"sriov_device": o_sysnetworks.get("sriov_device", list()),
}
if config_networks['bridge_mtu'] is None:
if config_networks["bridge_mtu"] is None:
# Read the current MTU of bridge_dev and set bridge_mtu to it; avoids weird resets
retcode, stdout, stderr = common.run_os_command(f"ip -json link show dev {config_networks['bridge_dev']}")
current_bridge_mtu = loads(stdout)[0]['mtu']
print(f"Config key bridge_mtu not explicitly set; using live MTU {current_bridge_mtu} from {config_networks['bridge_dev']}")
config_networks['bridge_mtu'] = current_bridge_mtu
retcode, stdout, stderr = common.run_os_command(
f"ip -json link show dev {config_networks['bridge_dev']}"
)
current_bridge_mtu = loads(stdout)[0]["mtu"]
print(
f"Config key bridge_mtu not explicitly set; using live MTU {current_bridge_mtu} from {config_networks['bridge_dev']}"
)
config_networks["bridge_mtu"] = current_bridge_mtu
config = {**config, **config_networks}
for network_type in ['cluster', 'storage', 'upstream']:
for network_type in ["cluster", "storage", "upstream"]:
result, msg = validate_floating_ip(config, network_type)
if not result:
raise MalformedConfigurationError(msg)
address_key = '{}_dev_ip'.format(network_type)
network_key = f'{network_type}_network'
address_key = "{}_dev_ip".format(network_type)
network_key = f"{network_type}_network"
network = ip_network(config[network_key])
# With autoselection of addresses, construct an IP from the relevant network
if config[address_key] == 'by-id':
if config[address_key] == "by-id":
# The NodeID starts at 1, but indexes start at 0
address_id = int(config['node_id']) - 1
address_id = int(config["node_id"]) - 1
# Grab the nth address from the network
config[address_key] = '{}/{}'.format(list(network.hosts())[address_id], network.prefixlen)
config[address_key] = "{}/{}".format(
list(network.hosts())[address_id], network.prefixlen
)
# Validate the provided IP instead
else:
try:
address = ip_address(config[address_key].split('/')[0])
address = ip_address(config[address_key].split("/")[0])
if address not in list(network.hosts()):
raise
except Exception:
raise MalformedConfigurationError(
f'IP address {config[address_key]} for {address_key} is not valid'
f"IP address {config[address_key]} for {address_key} is not valid"
)
# Get the PowerDNS aggregator database configuration
try:
o_pdnsdb = o_config['pvc']['coordinator']['dns']['database']
o_pdnsdb = o_config["pvc"]["coordinator"]["dns"]["database"]
except Exception as e:
raise MalformedConfigurationError(e)
config_pdnsdb = {
'pdns_postgresql_host': o_pdnsdb.get('host', None),
'pdns_postgresql_port': o_pdnsdb.get('port', None),
'pdns_postgresql_dbname': o_pdnsdb.get('name', None),
'pdns_postgresql_user': o_pdnsdb.get('user', None),
'pdns_postgresql_password': o_pdnsdb.get('pass', None),
"pdns_postgresql_host": o_pdnsdb.get("host", None),
"pdns_postgresql_port": o_pdnsdb.get("port", None),
"pdns_postgresql_dbname": o_pdnsdb.get("name", None),
"pdns_postgresql_user": o_pdnsdb.get("user", None),
"pdns_postgresql_password": o_pdnsdb.get("pass", None),
}
config = {**config, **config_pdnsdb}
# Get the Cloud-Init Metadata database configuration
try:
o_metadatadb = o_config['pvc']['coordinator']['metadata']['database']
o_metadatadb = o_config["pvc"]["coordinator"]["metadata"]["database"]
except Exception as e:
raise MalformedConfigurationError(e)
config_metadatadb = {
'metadata_postgresql_host': o_metadatadb.get('host', None),
'metadata_postgresql_port': o_metadatadb.get('port', None),
'metadata_postgresql_dbname': o_metadatadb.get('name', None),
'metadata_postgresql_user': o_metadatadb.get('user', None),
'metadata_postgresql_password': o_metadatadb.get('pass', None),
"metadata_postgresql_host": o_metadatadb.get("host", None),
"metadata_postgresql_port": o_metadatadb.get("port", None),
"metadata_postgresql_dbname": o_metadatadb.get("name", None),
"metadata_postgresql_user": o_metadatadb.get("user", None),
"metadata_postgresql_password": o_metadatadb.get("pass", None),
}
config = {**config, **config_metadatadb}
if config['enable_storage']:
if config["enable_storage"]:
# Get the storage configuration
try:
o_storage = o_config['pvc']['system']['configuration']['storage']
o_storage = o_config["pvc"]["system"]["configuration"]["storage"]
except Exception as e:
raise MalformedConfigurationError(e)
config_storage = {
'ceph_config_file': o_storage.get('ceph_config_file', None),
'ceph_admin_keyring': o_storage.get('ceph_admin_keyring', None),
"ceph_config_file": o_storage.get("ceph_config_file", None),
"ceph_admin_keyring": o_storage.get("ceph_admin_keyring", None),
}
config = {**config, **config_storage}
# Add our node static data to the config
config['static_data'] = get_static_data()
config["static_data"] = get_static_data()
return config
def validate_directories(config):
if not os.path.exists(config['dynamic_directory']):
os.makedirs(config['dynamic_directory'])
os.makedirs(config['dnsmasq_dynamic_directory'])
os.makedirs(config['pdns_dynamic_directory'])
os.makedirs(config['nft_dynamic_directory'])
if not os.path.exists(config["dynamic_directory"]):
os.makedirs(config["dynamic_directory"])
os.makedirs(config["dnsmasq_dynamic_directory"])
os.makedirs(config["pdns_dynamic_directory"])
os.makedirs(config["nft_dynamic_directory"])
if not os.path.exists(config['log_directory']):
os.makedirs(config['log_directory'])
os.makedirs(config['dnsmasq_log_directory'])
os.makedirs(config['pdns_log_directory'])
os.makedirs(config['nft_log_directory'])
if not os.path.exists(config["log_directory"]):
os.makedirs(config["log_directory"])
os.makedirs(config["dnsmasq_log_directory"])
os.makedirs(config["pdns_log_directory"])
os.makedirs(config["nft_log_directory"])

View File

@ -35,74 +35,89 @@ def fence_node(node_name, zkhandler, config, logger):
failcount = 0
while failcount < failcount_limit:
# Wait 5 seconds
time.sleep(config['keepalive_interval'])
time.sleep(config["keepalive_interval"])
# Get the state
node_daemon_state = zkhandler.read(('node.state.daemon', node_name))
node_daemon_state = zkhandler.read(("node.state.daemon", node_name))
# Is it still 'dead'
if node_daemon_state == 'dead':
if node_daemon_state == "dead":
failcount += 1
logger.out('Node "{}" failed {}/{} saving throws'.format(node_name, failcount, failcount_limit), state='s')
logger.out(
'Node "{}" failed {}/{} saving throws'.format(
node_name, failcount, failcount_limit
),
state="s",
)
# It changed back to something else so it must be alive
else:
logger.out('Node "{}" passed a saving throw; canceling fence'.format(node_name), state='o')
logger.out(
'Node "{}" passed a saving throw; canceling fence'.format(node_name),
state="o",
)
return
logger.out('Fencing node "{}" via IPMI reboot signal'.format(node_name), state='s')
logger.out('Fencing node "{}" via IPMI reboot signal'.format(node_name), state="s")
# Get IPMI information
ipmi_hostname = zkhandler.read(('node.ipmi.hostname', node_name))
ipmi_username = zkhandler.read(('node.ipmi.username', node_name))
ipmi_password = zkhandler.read(('node.ipmi.password', node_name))
ipmi_hostname = zkhandler.read(("node.ipmi.hostname", node_name))
ipmi_username = zkhandler.read(("node.ipmi.username", node_name))
ipmi_password = zkhandler.read(("node.ipmi.password", node_name))
# Shoot it in the head
fence_status = reboot_via_ipmi(ipmi_hostname, ipmi_username, ipmi_password, logger)
# Hold to ensure the fence takes effect and system stabilizes
logger.out('Waiting {}s for fence of node "{}" to take effect'.format(config['keepalive_interval'], node_name), state='i')
time.sleep(config['keepalive_interval'])
logger.out(
'Waiting {}s for fence of node "{}" to take effect'.format(
config["keepalive_interval"], node_name
),
state="i",
)
time.sleep(config["keepalive_interval"])
if fence_status:
logger.out('Marking node "{}" as fenced'.format(node_name), state='i')
logger.out('Marking node "{}" as fenced'.format(node_name), state="i")
while True:
try:
zkhandler.write([
(('node.state.daemon', node_name), 'fenced')
])
zkhandler.write([(("node.state.daemon", node_name), "fenced")])
break
except Exception:
continue
# Force into secondary network state if needed
if node_name in config['coordinators']:
logger.out('Forcing secondary status for node "{}"'.format(node_name), state='i')
zkhandler.write([
(('node.state.router', node_name), 'secondary')
])
if zkhandler.read('base.config.primary_node') == node_name:
zkhandler.write([
('base.config.primary_node', 'none')
])
if node_name in config["coordinators"]:
logger.out(
'Forcing secondary status for node "{}"'.format(node_name), state="i"
)
zkhandler.write([(("node.state.router", node_name), "secondary")])
if zkhandler.read("base.config.primary_node") == node_name:
zkhandler.write([("base.config.primary_node", "none")])
# If the fence succeeded and successful_fence is migrate
if fence_status and config['successful_fence'] == 'migrate':
if fence_status and config["successful_fence"] == "migrate":
migrateFromFencedNode(zkhandler, node_name, config, logger)
# If the fence failed and failed_fence is migrate
if not fence_status and config['failed_fence'] == 'migrate' and config['suicide_intervals'] != '0':
if (
not fence_status
and config["failed_fence"] == "migrate"
and config["suicide_intervals"] != "0"
):
migrateFromFencedNode(zkhandler, node_name, config, logger)
# Migrate hosts away from a fenced node
def migrateFromFencedNode(zkhandler, node_name, config, logger):
logger.out('Migrating VMs from dead node "{}" to new hosts'.format(node_name), state='i')
logger.out(
'Migrating VMs from dead node "{}" to new hosts'.format(node_name), state="i"
)
# Get the list of VMs
dead_node_running_domains = zkhandler.read(('node.running_domains', node_name)).split()
dead_node_running_domains = zkhandler.read(
("node.running_domains", node_name)
).split()
# Set the node to a custom domainstate so we know what's happening
zkhandler.write([
(('node.state.domain', node_name), 'fence-flush')
])
zkhandler.write([(("node.state.domain", node_name), "fence-flush")])
# Migrate a VM after a flush
def fence_migrate_vm(dom_uuid):
@ -111,28 +126,40 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
target_node = common.findTargetNode(zkhandler, dom_uuid)
if target_node is not None:
logger.out('Migrating VM "{}" to node "{}"'.format(dom_uuid, target_node), state='i')
zkhandler.write([
(('domain.state', dom_uuid), 'start'),
(('domain.node', dom_uuid), target_node),
(('domain.last_node', dom_uuid), node_name),
])
logger.out(
'Migrating VM "{}" to node "{}"'.format(dom_uuid, target_node),
state="i",
)
zkhandler.write(
[
(("domain.state", dom_uuid), "start"),
(("domain.node", dom_uuid), target_node),
(("domain.last_node", dom_uuid), node_name),
]
)
else:
logger.out('No target node found for VM "{}"; VM will autostart on next unflush/ready of current node'.format(dom_uuid), state='i')
zkhandler.write({
(('domain.state', dom_uuid), 'stopped'),
(('domain.meta.autostart', dom_uuid), 'True'),
})
logger.out(
'No target node found for VM "{}"; VM will autostart on next unflush/ready of current node'.format(
dom_uuid
),
state="i",
)
zkhandler.write(
{
(("domain.state", dom_uuid), "stopped"),
(("domain.meta.autostart", dom_uuid), "True"),
}
)
# Loop through the VMs
for dom_uuid in dead_node_running_domains:
fence_migrate_vm(dom_uuid)
# Set node in flushed state for easy remigrating when it comes back
zkhandler.write([
(('node.state.domain', node_name), 'flushed')
])
logger.out('All VMs flushed from dead node "{}" to new hosts'.format(node_name), state='i')
zkhandler.write([(("node.state.domain", node_name), "flushed")])
logger.out(
'All VMs flushed from dead node "{}" to new hosts'.format(node_name), state="i"
)
#
@ -140,68 +167,100 @@ def migrateFromFencedNode(zkhandler, node_name, config, logger):
#
def reboot_via_ipmi(ipmi_hostname, ipmi_user, ipmi_password, logger):
# Power off the node the node
logger.out('Sending power off to dead node', state='i')
ipmi_command_stop = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power off'.format(
ipmi_hostname, ipmi_user, ipmi_password
logger.out("Sending power off to dead node", state="i")
ipmi_command_stop = (
"/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power off".format(
ipmi_hostname, ipmi_user, ipmi_password
)
)
ipmi_stop_retcode, ipmi_stop_stdout, ipmi_stop_stderr = common.run_os_command(
ipmi_command_stop
)
ipmi_stop_retcode, ipmi_stop_stdout, ipmi_stop_stderr = common.run_os_command(ipmi_command_stop)
if ipmi_stop_retcode != 0:
logger.out(f'Failed to power off dead node: {ipmi_stop_stderr}', state='e')
logger.out(f"Failed to power off dead node: {ipmi_stop_stderr}", state="e")
time.sleep(5)
# Check the chassis power state
logger.out('Checking power state of dead node', state='i')
ipmi_command_status = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status'.format(
ipmi_hostname, ipmi_user, ipmi_password
logger.out("Checking power state of dead node", state="i")
ipmi_command_status = (
"/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status".format(
ipmi_hostname, ipmi_user, ipmi_password
)
)
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
ipmi_command_status
)
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(ipmi_command_status)
if ipmi_status_retcode == 0:
logger.out(f'Current chassis power state is: {ipmi_status_stdout.strip()}', state='i')
logger.out(
f"Current chassis power state is: {ipmi_status_stdout.strip()}", state="i"
)
else:
logger.out(f'Current chassis power state is: Unknown', state='w')
logger.out(f"Current chassis power state is: Unknown", state="w")
# Power on the node
logger.out('Sending power on to dead node', state='i')
ipmi_command_start = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power on'.format(
ipmi_hostname, ipmi_user, ipmi_password
logger.out("Sending power on to dead node", state="i")
ipmi_command_start = (
"/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power on".format(
ipmi_hostname, ipmi_user, ipmi_password
)
)
ipmi_start_retcode, ipmi_start_stdout, ipmi_start_stderr = common.run_os_command(
ipmi_command_start
)
ipmi_start_retcode, ipmi_start_stdout, ipmi_start_stderr = common.run_os_command(ipmi_command_start)
if ipmi_start_retcode != 0:
logger.out(f'Failed to power on dead node: {ipmi_start_stderr}', state='w')
logger.out(f"Failed to power on dead node: {ipmi_start_stderr}", state="w")
time.sleep(2)
# Check the chassis power state
logger.out('Checking power state of dead node', state='i')
ipmi_command_status = '/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status'.format(
ipmi_hostname, ipmi_user, ipmi_password
logger.out("Checking power state of dead node", state="i")
ipmi_command_status = (
"/usr/bin/ipmitool -I lanplus -H {} -U {} -P {} chassis power status".format(
ipmi_hostname, ipmi_user, ipmi_password
)
)
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
ipmi_command_status
)
ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(ipmi_command_status)
if ipmi_stop_retcode == 0:
if ipmi_status_stdout.strip() == "Chassis Power is on":
# We successfully rebooted the node and it is powered on; this is a succeessful fence
logger.out('Successfully rebooted dead node', state='o')
logger.out("Successfully rebooted dead node", state="o")
return True
elif ipmi_status_stdout.strip() == "Chassis Power is off":
# We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence
logger.out('Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence-flush', state='o')
logger.out(
"Chassis power is in confirmed off state after successfuly IPMI reboot; proceeding with fence-flush",
state="o",
)
return True
else:
# We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence
logger.out('Chassis power is in an unknown state ({}) after successful IPMI reboot; not performing fence-flush'.format(ipmi_status_stdout.strip()), state='e')
logger.out(
"Chassis power is in an unknown state ({}) after successful IPMI reboot; not performing fence-flush".format(
ipmi_status_stdout.strip()
),
state="e",
)
return False
else:
if ipmi_status_stdout.strip() == "Chassis Power is off":
# We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence
logger.out('Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence-flush', state='o')
logger.out(
"Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence-flush",
state="o",
)
return True
else:
# We failed to reboot the node but it is in some unknown power state (including "on"); since this might indicate a silent failure, we must call it a failed fence
logger.out('Chassis power is not in confirmed off state after failed IPMI reboot; not performing fence-flush', state='e')
logger.out(
"Chassis power is not in confirmed off state after failed IPMI reboot; not performing fence-flush",
state="e",
)
return False
@ -209,7 +268,7 @@ def reboot_via_ipmi(ipmi_hostname, ipmi_user, ipmi_password, logger):
# Verify that IPMI connectivity to this host exists (used during node init)
#
def verify_ipmi(ipmi_hostname, ipmi_user, ipmi_password):
ipmi_command = f'/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status'
ipmi_command = f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
retcode, stdout, stderr = common.run_os_command(ipmi_command, timeout=2)
if retcode == 0 and stdout.strip() == "Chassis Power is on":
return True

File diff suppressed because it is too large Load Diff

View File

@ -23,14 +23,14 @@ import libvirt
def validate_libvirtd(logger, config):
if config['enable_hypervisor']:
if config["enable_hypervisor"]:
libvirt_check_name = f'qemu+tcp://{config["node_hostname"]}/system'
logger.out(f'Connecting to Libvirt daemon at {libvirt_check_name}', state='i')
logger.out(f"Connecting to Libvirt daemon at {libvirt_check_name}", state="i")
try:
lv_conn = libvirt.open(libvirt_check_name)
lv_conn.close()
except Exception as e:
logger.out(f'Failed to connect to Libvirt daemon: {e}', state='e')
logger.out(f"Failed to connect to Libvirt daemon: {e}", state="e")
return False
return True

View File

@ -26,141 +26,192 @@ from os import makedirs
def setup_sriov(logger, config):
logger.out('Setting up SR-IOV device support', state='i')
logger.out("Setting up SR-IOV device support", state="i")
# Enable unsafe interrupts for the vfio_iommu_type1 kernel module
try:
common.run_os_command('modprobe vfio_iommu_type1 allow_unsafe_interrupts=1')
with open('/sys/module/vfio_iommu_type1/parameters/allow_unsafe_interrupts', 'w') as mfh:
mfh.write('Y')
common.run_os_command("modprobe vfio_iommu_type1 allow_unsafe_interrupts=1")
with open(
"/sys/module/vfio_iommu_type1/parameters/allow_unsafe_interrupts", "w"
) as mfh:
mfh.write("Y")
except Exception:
logger.out('Failed to enable vfio_iommu_type1 kernel module; SR-IOV may fail', state='w')
logger.out(
"Failed to enable vfio_iommu_type1 kernel module; SR-IOV may fail",
state="w",
)
# Loop through our SR-IOV NICs and enable the numvfs for each
for device in config['sriov_device']:
logger.out(f'Preparing SR-IOV PF {device["phy"]} with {device["vfcount"]} VFs', state='i')
for device in config["sriov_device"]:
logger.out(
f'Preparing SR-IOV PF {device["phy"]} with {device["vfcount"]} VFs',
state="i",
)
try:
with open(f'/sys/class/net/{device["phy"]}/device/sriov_numvfs', 'r') as vfh:
with open(
f'/sys/class/net/{device["phy"]}/device/sriov_numvfs', "r"
) as vfh:
current_vf_count = vfh.read().strip()
with open(f'/sys/class/net/{device["phy"]}/device/sriov_numvfs', 'w') as vfh:
vfh.write(str(device['vfcount']))
with open(
f'/sys/class/net/{device["phy"]}/device/sriov_numvfs', "w"
) as vfh:
vfh.write(str(device["vfcount"]))
except FileNotFoundError:
logger.out(f'Failed to open SR-IOV configuration for PF {device["phy"]}; device may not support SR-IOV', state='w')
logger.out(
f'Failed to open SR-IOV configuration for PF {device["phy"]}; device may not support SR-IOV',
state="w",
)
except OSError:
logger.out(f'Failed to set SR-IOV VF count for PF {device["phy"]} to {device["vfcount"]}; already set to {current_vf_count}', state='w')
logger.out(
f'Failed to set SR-IOV VF count for PF {device["phy"]} to {device["vfcount"]}; already set to {current_vf_count}',
state="w",
)
if device.get('mtu', None) is not None:
logger.out(f'Setting SR-IOV PF {device["phy"]} to MTU {device["mtu"]}', state='i')
if device.get("mtu", None) is not None:
logger.out(
f'Setting SR-IOV PF {device["phy"]} to MTU {device["mtu"]}', state="i"
)
common.run_os_command(f'ip link set {device["phy"]} mtu {device["mtu"]} up')
def setup_interfaces(logger, config):
# Set up the Cluster interface
cluster_dev = config['cluster_dev']
cluster_mtu = config['cluster_mtu']
cluster_dev_ip = config['cluster_dev_ip']
cluster_dev = config["cluster_dev"]
cluster_mtu = config["cluster_mtu"]
cluster_dev_ip = config["cluster_dev_ip"]
logger.out(f'Setting up Cluster network interface {cluster_dev} with MTU {cluster_mtu}', state='i')
logger.out(
f"Setting up Cluster network interface {cluster_dev} with MTU {cluster_mtu}",
state="i",
)
common.run_os_command(f'ip link set {cluster_dev} mtu {cluster_mtu} up')
common.run_os_command(f"ip link set {cluster_dev} mtu {cluster_mtu} up")
logger.out(f'Setting up Cluster network bridge on interface {cluster_dev} with IP {cluster_dev_ip}', state='i')
logger.out(
f"Setting up Cluster network bridge on interface {cluster_dev} with IP {cluster_dev_ip}",
state="i",
)
common.run_os_command(f'brctl addbr brcluster')
common.run_os_command(f'brctl addif brcluster {cluster_dev}')
common.run_os_command(f'ip link set brcluster mtu {cluster_mtu} up')
common.run_os_command(f'ip address add {cluster_dev_ip} dev brcluster')
common.run_os_command(f"brctl addbr brcluster")
common.run_os_command(f"brctl addif brcluster {cluster_dev}")
common.run_os_command(f"ip link set brcluster mtu {cluster_mtu} up")
common.run_os_command(f"ip address add {cluster_dev_ip} dev brcluster")
# Set up the Storage interface
storage_dev = config['storage_dev']
storage_mtu = config['storage_mtu']
storage_dev_ip = config['storage_dev_ip']
storage_dev = config["storage_dev"]
storage_mtu = config["storage_mtu"]
storage_dev_ip = config["storage_dev_ip"]
logger.out(f'Setting up Storage network interface {storage_dev} with MTU {storage_mtu}', state='i')
logger.out(
f"Setting up Storage network interface {storage_dev} with MTU {storage_mtu}",
state="i",
)
common.run_os_command(f'ip link set {storage_dev} mtu {storage_mtu} up')
common.run_os_command(f"ip link set {storage_dev} mtu {storage_mtu} up")
if storage_dev == cluster_dev:
if storage_dev_ip != cluster_dev_ip:
logger.out(f'Setting up Storage network on Cluster network bridge with IP {storage_dev_ip}', state='i')
logger.out(
f"Setting up Storage network on Cluster network bridge with IP {storage_dev_ip}",
state="i",
)
common.run_os_command(f'ip address add {storage_dev_ip} dev brcluster')
common.run_os_command(f"ip address add {storage_dev_ip} dev brcluster")
else:
logger.out(f'Setting up Storage network bridge on interface {storage_dev} with IP {storage_dev_ip}', state='i')
logger.out(
f"Setting up Storage network bridge on interface {storage_dev} with IP {storage_dev_ip}",
state="i",
)
common.run_os_command(f'brctl addbr brstorage')
common.run_os_command(f'brctl addif brstorage {storage_dev}')
common.run_os_command(f'ip link set brstorage mtu {storage_mtu} up')
common.run_os_command(f'ip address add {storage_dev_ip} dev brstorage')
common.run_os_command(f"brctl addbr brstorage")
common.run_os_command(f"brctl addif brstorage {storage_dev}")
common.run_os_command(f"ip link set brstorage mtu {storage_mtu} up")
common.run_os_command(f"ip address add {storage_dev_ip} dev brstorage")
# Set up the Upstream interface
upstream_dev = config['upstream_dev']
upstream_mtu = config['upstream_mtu']
upstream_dev_ip = config['upstream_dev_ip']
upstream_dev = config["upstream_dev"]
upstream_mtu = config["upstream_mtu"]
upstream_dev_ip = config["upstream_dev_ip"]
logger.out(f'Setting up Upstream network interface {upstream_dev} with MTU {upstream_mtu}', state='i')
logger.out(
f"Setting up Upstream network interface {upstream_dev} with MTU {upstream_mtu}",
state="i",
)
if upstream_dev == cluster_dev:
if upstream_dev_ip != cluster_dev_ip:
logger.out(f'Setting up Upstream network on Cluster network bridge with IP {upstream_dev_ip}', state='i')
logger.out(
f"Setting up Upstream network on Cluster network bridge with IP {upstream_dev_ip}",
state="i",
)
common.run_os_command(f'ip address add {upstream_dev_ip} dev brcluster')
common.run_os_command(f"ip address add {upstream_dev_ip} dev brcluster")
else:
logger.out(f'Setting up Upstream network bridge on interface {upstream_dev} with IP {upstream_dev_ip}', state='i')
logger.out(
f"Setting up Upstream network bridge on interface {upstream_dev} with IP {upstream_dev_ip}",
state="i",
)
common.run_os_command(f'brctl addbr brupstream')
common.run_os_command(f'brctl addif brupstream {upstream_dev}')
common.run_os_command(f'ip link set brupstream mtu {upstream_mtu} up')
common.run_os_command(f'ip address add {upstream_dev_ip} dev brupstream')
common.run_os_command(f"brctl addbr brupstream")
common.run_os_command(f"brctl addif brupstream {upstream_dev}")
common.run_os_command(f"ip link set brupstream mtu {upstream_mtu} up")
common.run_os_command(f"ip address add {upstream_dev_ip} dev brupstream")
upstream_gateway = config['upstream_gateway']
upstream_gateway = config["upstream_gateway"]
if upstream_gateway is not None:
logger.out(f'Setting up Upstream network default gateway IP {upstream_gateway}', state='i')
logger.out(
f"Setting up Upstream network default gateway IP {upstream_gateway}",
state="i",
)
if upstream_dev == cluster_dev:
common.run_os_command(f'ip route add default via {upstream_gateway} dev brcluster')
common.run_os_command(
f"ip route add default via {upstream_gateway} dev brcluster"
)
else:
common.run_os_command(f'ip route add default via {upstream_gateway} dev brupstream')
common.run_os_command(
f"ip route add default via {upstream_gateway} dev brupstream"
)
# Set up sysctl tweaks to optimize networking
# Enable routing functions
common.run_os_command('sysctl net.ipv4.ip_forward=1')
common.run_os_command('sysctl net.ipv6.ip_forward=1')
common.run_os_command("sysctl net.ipv4.ip_forward=1")
common.run_os_command("sysctl net.ipv6.ip_forward=1")
# Enable send redirects
common.run_os_command('sysctl net.ipv4.conf.all.send_redirects=1')
common.run_os_command('sysctl net.ipv4.conf.default.send_redirects=1')
common.run_os_command('sysctl net.ipv6.conf.all.send_redirects=1')
common.run_os_command('sysctl net.ipv6.conf.default.send_redirects=1')
common.run_os_command("sysctl net.ipv4.conf.all.send_redirects=1")
common.run_os_command("sysctl net.ipv4.conf.default.send_redirects=1")
common.run_os_command("sysctl net.ipv6.conf.all.send_redirects=1")
common.run_os_command("sysctl net.ipv6.conf.default.send_redirects=1")
# Accept source routes
common.run_os_command('sysctl net.ipv4.conf.all.accept_source_route=1')
common.run_os_command('sysctl net.ipv4.conf.default.accept_source_route=1')
common.run_os_command('sysctl net.ipv6.conf.all.accept_source_route=1')
common.run_os_command('sysctl net.ipv6.conf.default.accept_source_route=1')
common.run_os_command("sysctl net.ipv4.conf.all.accept_source_route=1")
common.run_os_command("sysctl net.ipv4.conf.default.accept_source_route=1")
common.run_os_command("sysctl net.ipv6.conf.all.accept_source_route=1")
common.run_os_command("sysctl net.ipv6.conf.default.accept_source_route=1")
# Disable RP filtering on Cluster and Upstream interfaces (to allow traffic pivoting)
common.run_os_command(f'sysctl net.ipv4.conf.{cluster_dev}.rp_filter=0')
common.run_os_command(f'sysctl net.ipv4.conf.brcluster.rp_filter=0')
common.run_os_command(f'sysctl net.ipv4.conf.{upstream_dev}.rp_filter=0')
common.run_os_command(f'sysctl net.ipv4.conf.brupstream.rp_filter=0')
common.run_os_command(f'sysctl net.ipv6.conf.{cluster_dev}.rp_filter=0')
common.run_os_command(f'sysctl net.ipv6.conf.brcluster.rp_filter=0')
common.run_os_command(f'sysctl net.ipv6.conf.{upstream_dev}.rp_filter=0')
common.run_os_command(f'sysctl net.ipv6.conf.brupstream.rp_filter=0')
common.run_os_command(f"sysctl net.ipv4.conf.{cluster_dev}.rp_filter=0")
common.run_os_command(f"sysctl net.ipv4.conf.brcluster.rp_filter=0")
common.run_os_command(f"sysctl net.ipv4.conf.{upstream_dev}.rp_filter=0")
common.run_os_command(f"sysctl net.ipv4.conf.brupstream.rp_filter=0")
common.run_os_command(f"sysctl net.ipv6.conf.{cluster_dev}.rp_filter=0")
common.run_os_command(f"sysctl net.ipv6.conf.brcluster.rp_filter=0")
common.run_os_command(f"sysctl net.ipv6.conf.{upstream_dev}.rp_filter=0")
common.run_os_command(f"sysctl net.ipv6.conf.brupstream.rp_filter=0")
# Stop DNSMasq if it is running
common.run_os_command('systemctl stop dnsmasq.service')
common.run_os_command("systemctl stop dnsmasq.service")
logger.out('Waiting 3 seconds for networking to come up', state='s')
logger.out("Waiting 3 seconds for networking to come up", state="s")
sleep(3)
def create_nft_configuration(logger, config):
if config['enable_networking']:
logger.out('Creating NFT firewall configuration', state='i')
if config["enable_networking"]:
logger.out("Creating NFT firewall configuration", state="i")
dynamic_directory = config['nft_dynamic_directory']
dynamic_directory = config["nft_dynamic_directory"]
# Create directories
makedirs(f'{dynamic_directory}/networks', exist_ok=True)
makedirs(f'{dynamic_directory}/static', exist_ok=True)
makedirs(f"{dynamic_directory}/networks", exist_ok=True)
makedirs(f"{dynamic_directory}/static", exist_ok=True)
# Set up the base rules
nftables_base_rules = f"""# Base rules
@ -175,7 +226,7 @@ def create_nft_configuration(logger, config):
"""
# Write the base firewall config
nftables_base_filename = f'{dynamic_directory}/base.nft'
with open(nftables_base_filename, 'w') as nftfh:
nftables_base_filename = f"{dynamic_directory}/base.nft"
with open(nftables_base_filename, "w") as nftfh:
nftfh.write(nftables_base_rules)
common.reload_firewall_rules(nftables_base_filename, logger)

View File

@ -24,45 +24,49 @@ from time import sleep
def start_zookeeper(logger, config):
if config['daemon_mode'] == 'coordinator':
logger.out('Starting Zookeeper daemon', state='i')
if config["daemon_mode"] == "coordinator":
logger.out("Starting Zookeeper daemon", state="i")
# TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
common.run_os_command('systemctl start zookeeper.service')
common.run_os_command("systemctl start zookeeper.service")
def start_libvirtd(logger, config):
if config['enable_hypervisor']:
logger.out('Starting Libvirt daemon', state='i')
if config["enable_hypervisor"]:
logger.out("Starting Libvirt daemon", state="i")
# TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
common.run_os_command('systemctl start libvirtd.service')
common.run_os_command("systemctl start libvirtd.service")
def start_patroni(logger, config):
if config['enable_networking'] and config['daemon_mode'] == 'coordinator':
logger.out('Starting Patroni daemon', state='i')
if config["enable_networking"] and config["daemon_mode"] == "coordinator":
logger.out("Starting Patroni daemon", state="i")
# TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
common.run_os_command('systemctl start patroni.service')
common.run_os_command("systemctl start patroni.service")
def start_frrouting(logger, config):
if config['enable_networking'] and config['daemon_mode'] == 'coordinator':
logger.out('Starting FRRouting daemon', state='i')
if config["enable_networking"] and config["daemon_mode"] == "coordinator":
logger.out("Starting FRRouting daemon", state="i")
# TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
common.run_os_command('systemctl start frr.service')
common.run_os_command("systemctl start frr.service")
def start_ceph_mon(logger, config):
if config['enable_storage'] and config['daemon_mode'] == 'coordinator':
logger.out('Starting Ceph Monitor daemon', state='i')
if config["enable_storage"] and config["daemon_mode"] == "coordinator":
logger.out("Starting Ceph Monitor daemon", state="i")
# TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
common.run_os_command(f'systemctl start ceph-mon@{config["node_hostname"]}.service')
common.run_os_command(
f'systemctl start ceph-mon@{config["node_hostname"]}.service'
)
def start_ceph_mgr(logger, config):
if config['enable_storage'] and config['daemon_mode'] == 'coordinator':
logger.out('Starting Ceph Manager daemon', state='i')
if config["enable_storage"] and config["daemon_mode"] == "coordinator":
logger.out("Starting Ceph Manager daemon", state="i")
# TODO: Move our handling out of Systemd and integrate it directly as a subprocess?
common.run_os_command(f'systemctl start ceph-mgr@{config["node_hostname"]}.service')
common.run_os_command(
f'systemctl start ceph-mgr@{config["node_hostname"]}.service'
)
def start_system_services(logger, config):
@ -73,5 +77,5 @@ def start_system_services(logger, config):
start_ceph_mon(logger, config)
start_ceph_mgr(logger, config)
logger.out('Waiting 3 seconds for daemons to start', state='s')
logger.out("Waiting 3 seconds for daemons to start", state="s")
sleep(3)

View File

@ -31,45 +31,61 @@ def connect(logger, config):
zkhandler = ZKHandler(config, logger)
try:
logger.out('Connecting to Zookeeper on coordinator nodes {}'.format(config['coordinators']), state='i')
logger.out(
"Connecting to Zookeeper on coordinator nodes {}".format(
config["coordinators"]
),
state="i",
)
# Start connection
zkhandler.connect(persistent=True)
except Exception as e:
logger.out('ERROR: Failed to connect to Zookeeper cluster: {}'.format(e), state='e')
logger.out(
"ERROR: Failed to connect to Zookeeper cluster: {}".format(e), state="e"
)
os._exit(1)
logger.out('Validating Zookeeper schema', state='i')
logger.out("Validating Zookeeper schema", state="i")
try:
node_schema_version = int(zkhandler.read(('node.data.active_schema', config['node_hostname'])))
node_schema_version = int(
zkhandler.read(("node.data.active_schema", config["node_hostname"]))
)
except Exception:
node_schema_version = int(zkhandler.read('base.schema.version'))
zkhandler.write([
(('node.data.active_schema', config['node_hostname']), node_schema_version)
])
node_schema_version = int(zkhandler.read("base.schema.version"))
zkhandler.write(
[
(
("node.data.active_schema", config["node_hostname"]),
node_schema_version,
)
]
)
# Load in the current node schema version
zkhandler.schema.load(node_schema_version)
# Record the latest intalled schema version
latest_schema_version = zkhandler.schema.find_latest()
logger.out('Latest installed schema is {}'.format(latest_schema_version), state='i')
zkhandler.write([
(('node.data.latest_schema', config['node_hostname']), latest_schema_version)
])
logger.out("Latest installed schema is {}".format(latest_schema_version), state="i")
zkhandler.write(
[(("node.data.latest_schema", config["node_hostname"]), latest_schema_version)]
)
# If we are the last node to get a schema update, fire the master update
if latest_schema_version > node_schema_version:
node_latest_schema_version = list()
for node in zkhandler.children('base.node'):
node_latest_schema_version.append(int(zkhandler.read(('node.data.latest_schema', node))))
for node in zkhandler.children("base.node"):
node_latest_schema_version.append(
int(zkhandler.read(("node.data.latest_schema", node)))
)
# This is true if all elements of the latest schema version are identical to the latest version,
# i.e. they have all had the latest schema installed and ready to load.
if node_latest_schema_version.count(latest_schema_version) == len(node_latest_schema_version):
zkhandler.write([
('base.schema.version', latest_schema_version)
])
if node_latest_schema_version.count(latest_schema_version) == len(
node_latest_schema_version
):
zkhandler.write([("base.schema.version", latest_schema_version)])
return zkhandler, node_schema_version
@ -77,56 +93,95 @@ def connect(logger, config):
def validate_schema(logger, zkhandler):
# Validate our schema against the active version
if not zkhandler.schema.validate(zkhandler, logger):
logger.out('Found schema violations, applying', state='i')
logger.out("Found schema violations, applying", state="i")
zkhandler.schema.apply(zkhandler)
else:
logger.out('Schema successfully validated', state='o')
logger.out("Schema successfully validated", state="o")
def setup_node(logger, config, zkhandler):
# Check if our node exists in Zookeeper, and create it if not
if config['daemon_mode'] == 'coordinator':
init_routerstate = 'secondary'
if config["daemon_mode"] == "coordinator":
init_routerstate = "secondary"
else:
init_routerstate = 'client'
init_routerstate = "client"
if zkhandler.exists(('node', config['node_hostname'])):
logger.out(f'Node is {logger.fmt_green}present{logger.fmt_end} in Zookeeper', state='i')
if zkhandler.exists(("node", config["node_hostname"])):
logger.out(
f"Node is {logger.fmt_green}present{logger.fmt_end} in Zookeeper", state="i"
)
# Update static data just in case it's changed
zkhandler.write([
(('node', config['node_hostname']), config['daemon_mode']),
(('node.mode', config['node_hostname']), config['daemon_mode']),
(('node.state.daemon', config['node_hostname']), 'init'),
(('node.state.router', config['node_hostname']), init_routerstate),
(('node.data.static', config['node_hostname']), ' '.join(config['static_data'])),
(('node.data.pvc_version', config['node_hostname']), config['pvcnoded_version']),
(('node.ipmi.hostname', config['node_hostname']), config['ipmi_hostname']),
(('node.ipmi.username', config['node_hostname']), config['ipmi_username']),
(('node.ipmi.password', config['node_hostname']), config['ipmi_password']),
])
zkhandler.write(
[
(("node", config["node_hostname"]), config["daemon_mode"]),
(("node.mode", config["node_hostname"]), config["daemon_mode"]),
(("node.state.daemon", config["node_hostname"]), "init"),
(("node.state.router", config["node_hostname"]), init_routerstate),
(
("node.data.static", config["node_hostname"]),
" ".join(config["static_data"]),
),
(
("node.data.pvc_version", config["node_hostname"]),
config["pvcnoded_version"],
),
(
("node.ipmi.hostname", config["node_hostname"]),
config["ipmi_hostname"],
),
(
("node.ipmi.username", config["node_hostname"]),
config["ipmi_username"],
),
(
("node.ipmi.password", config["node_hostname"]),
config["ipmi_password"],
),
]
)
else:
logger.out(f'Node is {logger.fmt_red}absent{logger.fmt_end} in Zookeeper; adding new node', state='i')
logger.out(
f"Node is {logger.fmt_red}absent{logger.fmt_end} in Zookeeper; adding new node",
state="i",
)
keepalive_time = int(time.time())
zkhandler.write([
(('node', config['node_hostname']), config['daemon_mode']),
(('node.keepalive', config['node_hostname']), str(keepalive_time)),
(('node.mode', config['node_hostname']), config['daemon_mode']),
(('node.state.daemon', config['node_hostname']), 'init'),
(('node.state.domain', config['node_hostname']), 'flushed'),
(('node.state.router', config['node_hostname']), init_routerstate),
(('node.data.static', config['node_hostname']), ' '.join(config['static_data'])),
(('node.data.pvc_version', config['node_hostname']), config['pvcnoded_version']),
(('node.ipmi.hostname', config['node_hostname']), config['ipmi_hostname']),
(('node.ipmi.username', config['node_hostname']), config['ipmi_username']),
(('node.ipmi.password', config['node_hostname']), config['ipmi_password']),
(('node.memory.total', config['node_hostname']), '0'),
(('node.memory.used', config['node_hostname']), '0'),
(('node.memory.free', config['node_hostname']), '0'),
(('node.memory.allocated', config['node_hostname']), '0'),
(('node.memory.provisioned', config['node_hostname']), '0'),
(('node.vcpu.allocated', config['node_hostname']), '0'),
(('node.cpu.load', config['node_hostname']), '0.0'),
(('node.running_domains', config['node_hostname']), '0'),
(('node.count.provisioned_domains', config['node_hostname']), '0'),
(('node.count.networks', config['node_hostname']), '0'),
])
zkhandler.write(
[
(("node", config["node_hostname"]), config["daemon_mode"]),
(("node.keepalive", config["node_hostname"]), str(keepalive_time)),
(("node.mode", config["node_hostname"]), config["daemon_mode"]),
(("node.state.daemon", config["node_hostname"]), "init"),
(("node.state.domain", config["node_hostname"]), "flushed"),
(("node.state.router", config["node_hostname"]), init_routerstate),
(
("node.data.static", config["node_hostname"]),
" ".join(config["static_data"]),
),
(
("node.data.pvc_version", config["node_hostname"]),
config["pvcnoded_version"],
),
(
("node.ipmi.hostname", config["node_hostname"]),
config["ipmi_hostname"],
),
(
("node.ipmi.username", config["node_hostname"]),
config["ipmi_username"],
),
(
("node.ipmi.password", config["node_hostname"]),
config["ipmi_password"],
),
(("node.memory.total", config["node_hostname"]), "0"),
(("node.memory.used", config["node_hostname"]), "0"),
(("node.memory.free", config["node_hostname"]), "0"),
(("node.memory.allocated", config["node_hostname"]), "0"),
(("node.memory.provisioned", config["node_hostname"]), "0"),
(("node.vcpu.allocated", config["node_hostname"]), "0"),
(("node.cpu.load", config["node_hostname"]), "0.0"),
(("node.running_domains", config["node_hostname"]), "0"),
(("node.count.provisioned_domains", config["node_hostname"]), "0"),
(("node.count.networks", config["node_hostname"]), "0"),
]
)