Compare commits

17 commits:

1aa5999109, 570460e5ee, 7a99e0e524, 234d6ae83b, 5d0e7931d1, dcb9c0d12c, f6e856bf98, f1fe0c63f5, ab944f9b95, 9714ac20b2, 79ad09ae59, 4c6aabec6a, 559400ed90, 78c774b607, a461791ce8, 9fdb6d8708, 2fb7c40497
CHANGELOG.md (+21)
@@ -1,5 +1,26 @@
 ## PVC Changelog
 
+###### [v0.9.98](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.98)
+
+  * [CLI Client] Fixed output when API call times out
+  * [Node Daemon] Improves the handling of fence states
+  * [API Daemon/CLI Client] Adds support for storage snapshot rollback
+  * [CLI Client] Adds additional warning messages about snapshot consistency to help output
+  * [API Daemon] Fixes a bug listing snapshots by pool/volume
+  * [Node Daemon] Adds a --version flag for information gathering by update-motd.sh
+
+###### [v0.9.97](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.97)
+
+  * [Client CLI] Ensures --lines is always an integer value
+  * [Node Daemon] Fixes a bug if d_network changes during iteration
+  * [Node Daemon] Moves to using allocated instead of free memory for node reporting
+  * [API Daemon] Fixes a bug if lingering RBD snapshots exist when removing a volume (#180)
+
+###### [v0.9.96](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.96)
+
+  * [API Daemon] Fixes a bug when reporting node stats
+  * [API Daemon] Fixes a bug deleting successful benchmark results
+
 ###### [v0.9.95](https://github.com/parallelvirtualcluster/pvc/releases/tag/v0.9.95)
 
   * [API Daemon/CLI Client] Adds a flag to allow duplicate VNIs in network templates
@@ -27,7 +27,7 @@ from distutils.util import strtobool as dustrtobool
 import daemon_lib.config as cfg
 
 # Daemon version
-version = "0.9.95"
+version = "0.9.98"
 
 # API version
 API_VERSION = 1.0
@@ -6362,6 +6362,59 @@ api.add_resource(
 )
 
 
+# /storage/ceph/snapshot/<pool>/<volume>/<snapshot>/rollback
+class API_Storage_Ceph_Snapshot_Rollback_Element(Resource):
+    @Authenticator
+    def post(self, pool, volume, snapshot):
+        """
+        Roll back an RBD volume {volume} in pool {pool} to snapshot {snapshot}
+
+        WARNING: This action cannot be done on an active RBD volume. All IO MUST be stopped first.
+        ---
+        tags:
+          - storage / ceph
+        parameters:
+          - in: query
+            name: snapshot
+            type: string
+            required: true
+            description: The name of the snapshot
+          - in: query
+            name: volume
+            type: string
+            required: true
+            description: The name of the volume
+          - in: query
+            name: pool
+            type: string
+            required: true
+            description: The name of the pool
+        responses:
+          200:
+            description: OK
+            schema:
+              type: object
+              id: Message
+          404:
+            description: Not found
+            schema:
+              type: object
+              id: Message
+          400:
+            description: Bad request
+            schema:
+              type: object
+              id: Message
+        """
+        return api_helper.ceph_volume_snapshot_rollback(pool, volume, snapshot)
+
+
+api.add_resource(
+    API_Storage_Ceph_Snapshot_Rollback_Element,
+    "/storage/ceph/snapshot/<pool>/<volume>/<snapshot>/rollback",
+)
+
+
 ##########################################################
 # Provisioner API
 ##########################################################
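A minimal way to exercise the new endpoint from a client, assuming a reachable PVC API; the base URL, port, and `X-Api-Key` header value here are deployment-specific placeholders, as are the pool/volume/snapshot names:

```python
# Hypothetical smoke test for the new rollback endpoint. The host, port,
# and API key are deployment-specific assumptions, not fixed values.
import requests

API_BASE = "http://pvc-api.local:7370/api/v1"
resp = requests.post(
    f"{API_BASE}/storage/ceph/snapshot/vms/test-volume/snap1/rollback",
    headers={"X-Api-Key": "supersecretkey"},
)
print(resp.status_code, resp.json().get("message"))
```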
@@ -2183,6 +2183,22 @@ def ceph_volume_snapshot_rename(zkhandler, pool, volume, name, new_name):
     return output, retcode
 
 
+@ZKConnection(config)
+def ceph_volume_snapshot_rollback(zkhandler, pool, volume, name):
+    """
+    Roll back a Ceph RBD volume to a given snapshot in the PVC Ceph storage cluster.
+    """
+    retflag, retdata = pvc_ceph.rollback_snapshot(zkhandler, pool, volume, name)
+
+    if retflag:
+        retcode = 200
+    else:
+        retcode = 400
+
+    output = {"message": retdata.replace('"', "'")}
+    return output, retcode
+
+
 @ZKConnection(config)
 def ceph_volume_snapshot_remove(zkhandler, pool, volume, name):
     """
@@ -671,9 +671,9 @@ def cli_cluster_maintenance_off():
 @format_opt(
     {
         "pretty": cli_cluster_task_format_pretty,
-        "raw": lambda d: "\n".join([t["id"] for t in d])
-        if isinstance(d, list)
-        else d["state"],
+        "raw": lambda d: (
+            "\n".join([t["id"] for t in d]) if isinstance(d, list) else d["state"]
+        ),
         "json": lambda d: jdumps(d),
         "json-pretty": lambda d: jdumps(d, indent=2),
     }
@@ -892,6 +892,7 @@ def cli_node_ready(
     "--lines",
     "lines",
     default=None,
+    type=int,
     show_default=False,
     help="Display this many log lines from the end of the log buffer. [default: 1000; with follow: 10]",
 )
@@ -2516,6 +2517,7 @@ def cli_vm_volume_remove(domain, volume, live_flag, restart_flag):
     "--lines",
     "lines",
     default=None,
+    type=int,
     show_default=False,
     help="Display this many log lines from the end of the log buffer. [default: 1000; with follow: 10]",
 )
@@ -4323,6 +4325,10 @@ def cli_storage_volume_snapshot():
 def cli_storage_volume_snapshot_add(pool, volume, name):
     """
     Add a snapshot with name NAME of Ceph RBD volume VOLUME in pool POOL.
+
+    WARNING: RBD snapshots are crash-consistent but not filesystem-aware. If a snapshot was taken
+    of a running VM, restoring that snapshot will be equivalent to having forcibly restarted the
+    VM at the moment of the snapshot.
     """
 
     retcode, retmsg = pvc.lib.storage.ceph_snapshot_add(CLI_CONFIG, pool, volume, name)
@@ -4370,6 +4376,36 @@ def cli_storage_volume_snapshot_remove(pool, volume, name):
     finish(retcode, retmsg)
 
 
+###############################################################################
+# > pvc storage volume snapshot rollback
+###############################################################################
+@click.command(name="rollback", short_help="Roll back RBD volume to snapshot.")
+@connection_req
+@click.argument("pool")
+@click.argument("volume")
+@click.argument("name")
+@confirm_opt("Roll back to snapshot {name} for volume {pool}/{volume}")
+def cli_storage_volume_snapshot_rollback(pool, volume, name):
+    """
+    Roll back the Ceph RBD volume VOLUME in pool POOL to the snapshot NAME.
+
+    DANGER: All data written to the volume since the given snapshot will be permanently lost.
+
+    WARNING: A rollback cannot be performed on an RBD volume with active I/O. Doing so will cause
+    undefined behaviour and possible corruption. Ensure that any VM(s) using this RBD volume are
+    stopped or disabled before attempting a snapshot rollback.
+
+    WARNING: RBD snapshots are crash-consistent but not filesystem-aware. If a snapshot was taken
+    of a running VM, restoring that snapshot will be equivalent to having forcibly restarted the
+    VM at the moment of the snapshot.
+    """
+
+    retcode, retmsg = pvc.lib.storage.ceph_snapshot_rollback(
+        CLI_CONFIG, pool, volume, name
+    )
+    finish(retcode, retmsg)
+
+
 ###############################################################################
 # > pvc storage volume snapshot list
 ###############################################################################
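With the command registered below, a typical invocation (hypothetical names) is `pvc storage volume snapshot rollback vms test-volume snap1`; the `@confirm_opt` decorator prompts for confirmation before the API call is made, and per the docstring any VM using the volume must be stopped or disabled first.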
@@ -6347,6 +6383,7 @@ cli_storage_volume.add_command(cli_storage_volume_list)
 cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_add)
 cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_rename)
 cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_remove)
+cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_rollback)
 cli_storage_volume_snapshot.add_command(cli_storage_volume_snapshot_list)
 cli_storage_volume.add_command(cli_storage_volume_snapshot)
 cli_storage.add_command(cli_storage_volume)
@@ -580,9 +580,11 @@ def cli_cluster_fault_list_format_long(CLI_CONFIG, fault_data):
             fault_id=fault["id"],
             fault_status=fault["status"].title(),
             fault_health_delta=f"-{fault['health_delta']}%",
-            fault_acknowledged_at=fault["acknowledged_at"]
-            if fault["acknowledged_at"] != ""
-            else "N/A",
+            fault_acknowledged_at=(
+                fault["acknowledged_at"]
+                if fault["acknowledged_at"] != ""
+                else "N/A"
+            ),
             fault_last_reported=fault["last_reported"],
             fault_first_reported=fault["first_reported"],
         )
@@ -108,9 +108,10 @@ class UploadProgressBar(object):
 
 
 class ErrorResponse(requests.Response):
-    def __init__(self, json_data, status_code):
+    def __init__(self, json_data, status_code, headers):
         self.json_data = json_data
         self.status_code = status_code
+        self.headers = headers
 
     def json(self):
         return self.json_data
@@ -206,7 +207,7 @@ def call_api(
     except Exception as e:
         message = "Failed to connect to the API: {}".format(e)
         code = response.status_code if response else 504
-        response = ErrorResponse({"message": message}, code)
+        response = ErrorResponse({"message": message}, code, None)
 
     # Display debug output
     if config["debug"]:
@@ -1544,6 +1544,30 @@ def ceph_snapshot_add(config, pool, volume, snapshot):
     return retstatus, response.json().get("message", "")
 
 
+def ceph_snapshot_rollback(config, pool, volume, snapshot):
+    """
+    Roll back Ceph volume to snapshot
+
+    API endpoint: POST /api/v1/storage/ceph/snapshot/{pool}/{volume}/{snapshot}/rollback
+    API arguments:
+    API schema: {"message":"{data}"}
+    """
+    response = call_api(
+        config,
+        "post",
+        "/storage/ceph/snapshot/{pool}/{volume}/{snapshot}/rollback".format(
+            snapshot=snapshot, volume=volume, pool=pool
+        ),
+    )
+
+    if response.status_code == 200:
+        retstatus = True
+    else:
+        retstatus = False
+
+    return retstatus, response.json().get("message", "")
+
+
 def ceph_snapshot_remove(config, pool, volume, snapshot):
     """
     Remove Ceph snapshot
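For completeness, a sketch of calling this helper directly from Python rather than through the CLI; `config` stands in for the loaded configuration dict that the PVC CLI normally passes to `pvc.lib` functions, and the names are placeholders:

```python
# Hypothetical sketch; "config" must be the loaded PVC client configuration
# dict (e.g. the CLI's CLI_CONFIG), and the names below are placeholders.
from pvc.lib.storage import ceph_snapshot_rollback

ok, message = ceph_snapshot_rollback(config, "vms", "test-volume", "snap1")  # noqa: F821
if not ok:
    raise SystemExit(f"Rollback failed: {message}")
print(message)
```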
@@ -1765,9 +1765,9 @@ def format_info(config, domain_information, long_output):
                     tags_name=tag["name"],
                     tags_type=tag["type"],
                     tags_protected=str(tag["protected"]),
-                    tags_protected_colour=ansiprint.green()
-                    if tag["protected"]
-                    else ansiprint.blue(),
+                    tags_protected_colour=(
+                        ansiprint.green() if tag["protected"] else ansiprint.blue()
+                    ),
                     end=ansiprint.end(),
                 )
             )
@@ -2,7 +2,7 @@ from setuptools import setup
 
 setup(
     name="pvc",
-    version="0.9.95",
+    version="0.9.98",
     packages=["pvc.cli", "pvc.lib"],
     install_requires=[
         "Click",
|
@ -115,12 +115,13 @@ class BenchmarkError(Exception):
|
||||
#
|
||||
|
||||
|
||||
def cleanup(job_name, db_conn=None, db_cur=None, zkhandler=None):
|
||||
def cleanup(job_name, db_conn=None, db_cur=None, zkhandler=None, final=False):
|
||||
if db_conn is not None and db_cur is not None:
|
||||
# Clean up our dangling result
|
||||
query = "DELETE FROM storage_benchmarks WHERE job = %s;"
|
||||
args = (job_name,)
|
||||
db_cur.execute(query, args)
|
||||
if not final:
|
||||
# Clean up our dangling result (non-final runs only)
|
||||
query = "DELETE FROM storage_benchmarks WHERE job = %s;"
|
||||
args = (job_name,)
|
||||
db_cur.execute(query, args)
|
||||
db_conn.commit()
|
||||
# Close the database connections cleanly
|
||||
close_database(db_conn, db_cur)
|
||||
@@ -410,6 +411,7 @@ def worker_run_benchmark(zkhandler, celery, config, pool):
         db_conn=db_conn,
         db_cur=db_cur,
         zkhandler=zkhandler,
+        final=True,
     )
 
     current_stage += 1
@@ -320,7 +320,11 @@ def get_list_osd(zkhandler, limit=None, is_fuzzy=True):
 #
 def getPoolInformation(zkhandler, pool):
     # Parse the stats data
-    (pool_stats_raw, tier, pgs,) = zkhandler.read_many(
+    (
+        pool_stats_raw,
+        tier,
+        pgs,
+    ) = zkhandler.read_many(
         [
             ("pool.stats", pool),
             ("pool.tier", pool),
@@ -536,7 +540,10 @@ def getCephVolumes(zkhandler, pool):
         pool_list = [pool]
 
     for pool_name in pool_list:
-        for volume_name in zkhandler.children(("volume", pool_name)):
+        children = zkhandler.children(("volume", pool_name))
+        if children is None:
+            continue
+        for volume_name in children:
             volume_list.append("{}/{}".format(pool_name, volume_name))
 
     return volume_list
@@ -824,10 +831,22 @@ def remove_volume(zkhandler, pool, name):
             name, pool
         )
 
-    # 1. Remove volume snapshots
+    # 1a. Remove PVC-managed volume snapshots
     for snapshot in zkhandler.children(("snapshot", f"{pool}/{name}")):
         remove_snapshot(zkhandler, pool, name, snapshot)
 
+    # 1b. Purge any remaining volume snapshots
+    retcode, stdout, stderr = common.run_os_command(
+        "rbd snap purge {}/{}".format(pool, name)
+    )
+    if retcode:
+        return (
+            False,
+            'ERROR: Failed to purge snapshots from RBD volume "{}" in pool "{}": {}'.format(
+                name, pool, stderr
+            ),
+        )
+
     # 2. Remove the volume
     retcode, stdout, stderr = common.run_os_command("rbd rm {}/{}".format(pool, name))
     if retcode:
@@ -1066,6 +1085,36 @@ def rename_snapshot(zkhandler, pool, volume, name, new_name):
     )
 
 
+def rollback_snapshot(zkhandler, pool, volume, name):
+    if not verifyVolume(zkhandler, pool, volume):
+        return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
+            volume, pool
+        )
+    if not verifySnapshot(zkhandler, pool, volume, name):
+        return (
+            False,
+            'ERROR: No snapshot with name "{}" is present for volume "{}" in pool "{}".'.format(
+                name, volume, pool
+            ),
+        )
+
+    # 1. Roll back the snapshot
+    retcode, stdout, stderr = common.run_os_command(
+        "rbd snap rollback {}/{}@{}".format(pool, volume, name)
+    )
+    if retcode:
+        return (
+            False,
+            'ERROR: Failed to roll back RBD volume "{}" in pool "{}" to snapshot "{}": {}'.format(
+                volume, pool, name, stderr
+            ),
+        )
+
+    return True, 'Rolled back RBD volume "{}" in pool "{}" to snapshot "{}".'.format(
+        volume, pool, name
+    )
+
+
 def remove_snapshot(zkhandler, pool, volume, name):
     if not verifyVolume(zkhandler, pool, volume):
         return False, 'ERROR: No volume with name "{}" is present in pool "{}".'.format(
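Since `rbd snap rollback` against an in-use image risks corruption (hence the CLI warnings above), a cautious wrapper could first verify that the image has no watchers. This sketch is illustrative only and not part of the PVC code; it relies on the stock `rbd status --format json` output, which includes a `watchers` list:

```python
# Illustrative pre-flight check (not part of PVC): refuse to roll back while
# any client still has the RBD image open (i.e. it has active watchers).
import json
import subprocess

def has_watchers(pool: str, volume: str) -> bool:
    out = subprocess.run(
        ["rbd", "status", f"{pool}/{volume}", "--format", "json"],
        capture_output=True, text=True, check=True,
    ).stdout
    return len(json.loads(out).get("watchers", [])) > 0

if has_watchers("vms", "test-volume"):  # hypothetical names
    raise SystemExit("Volume is in use; stop the VM before rolling back.")
```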
@@ -1107,20 +1156,9 @@ def remove_snapshot(zkhandler, pool, volume, name):
     )
 
 
-def get_list_snapshot(zkhandler, pool, volume, limit=None, is_fuzzy=True):
+def get_list_snapshot(zkhandler, target_pool, target_volume, limit=None, is_fuzzy=True):
     snapshot_list = []
-    if pool and not verifyPool(zkhandler, pool):
-        return False, 'ERROR: No pool with name "{}" is present in the cluster.'.format(
-            pool
-        )
-
-    if volume and not verifyPool(zkhandler, volume):
-        return (
-            False,
-            'ERROR: No volume with name "{}" is present in the cluster.'.format(volume),
-        )
-
-    full_snapshot_list = getCephSnapshots(zkhandler, pool, volume)
+    full_snapshot_list = getCephSnapshots(zkhandler, target_pool, target_volume)
 
     if is_fuzzy and limit:
         # Implicitly assume fuzzy limits
@@ -1132,6 +1170,10 @@ def get_list_snapshot(zkhandler, pool, volume, limit=None, is_fuzzy=True):
     for snapshot in full_snapshot_list:
         volume, snapshot_name = snapshot.split("@")
         pool_name, volume_name = volume.split("/")
+        if target_pool and pool_name != target_pool:
+            continue
+        if target_volume and volume_name != target_volume:
+            continue
         if limit:
             try:
                 if re.fullmatch(limit, snapshot_name):
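A standalone illustration of the new filtering semantics, using hypothetical snapshot data; only the pool filter is set here, so entries from other pools are skipped while the `limit` regex is matched against the snapshot name alone:

```python
import re

# Hypothetical data in "pool/volume@snapshot" form
full_snapshot_list = [
    "vms/test-volume@snap1",
    "vms/other-volume@snap2",
    "data/test-volume@snap1",
]
target_pool, target_volume, limit = "vms", None, "snap.*"

for snapshot in full_snapshot_list:
    volume, snapshot_name = snapshot.split("@")
    pool_name, volume_name = volume.split("/")
    if target_pool and pool_name != target_pool:
        continue  # wrong pool
    if target_volume and volume_name != target_volume:
        continue  # wrong volume
    if limit and not re.fullmatch(limit, snapshot_name):
        continue  # name does not match the fuzzy limit
    print(snapshot)  # prints only the two "vms/" entries
```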
@@ -244,9 +244,9 @@ def get_parsed_configuration(config_file):
             ]
         ][0]
 
-        config_cluster_networks_specific[
-            f"{network_type}_dev_ip"
-        ] = f"{list(network.hosts())[address_id]}/{network.prefixlen}"
+        config_cluster_networks_specific[f"{network_type}_dev_ip"] = (
+            f"{list(network.hosts())[address_id]}/{network.prefixlen}"
+        )
 
         config = {**config, **config_cluster_networks_specific}
@@ -69,6 +69,8 @@ def getNodeHealthDetails(zkhandler, node_name, node_health_plugins):
             plugin_message,
             plugin_data,
         ) = tuple(all_plugin_data[pos_start:pos_end])
+        if plugin_data is None:
+            continue
         plugin_output = {
             "name": plugin,
             "last_run": int(plugin_last_run) if plugin_last_run is not None else None,
@@ -156,9 +158,9 @@ def getNodeInformation(zkhandler, node_name):
         zkhandler, node_name, node_health_plugins
     )
 
-    if _node_network_stats is not None:
+    try:
         node_network_stats = json.loads(_node_network_stats)
-    else:
+    except Exception:
         node_network_stats = dict()
 
     # Construct a data structure to represent the data
debian/changelog (vendored, +27)
@@ -1,3 +1,30 @@
+pvc (0.9.98-0) unstable; urgency=high
+
+  * [CLI Client] Fixed output when API call times out
+  * [Node Daemon] Improves the handling of fence states
+  * [API Daemon/CLI Client] Adds support for storage snapshot rollback
+  * [CLI Client] Adds additional warning messages about snapshot consistency to help output
+  * [API Daemon] Fixes a bug listing snapshots by pool/volume
+  * [Node Daemon] Adds a --version flag for information gathering by update-motd.sh
+
+ -- Joshua M. Boniface <joshua@boniface.me>  Wed, 05 Jun 2024 12:01:31 -0400
+
+pvc (0.9.97-0) unstable; urgency=high
+
+  * [Client CLI] Ensures --lines is always an integer value
+  * [Node Daemon] Fixes a bug if d_network changes during iteration
+  * [Node Daemon] Moves to using allocated instead of free memory for node reporting
+  * [API Daemon] Fixes a bug if lingering RBD snapshots exist when removing a volume (#180)
+
+ -- Joshua M. Boniface <joshua@boniface.me>  Fri, 19 Apr 2024 10:32:16 -0400
+
+pvc (0.9.96-0) unstable; urgency=high
+
+  * [API Daemon] Fixes a bug when reporting node stats
+  * [API Daemon] Fixes a bug deleting successful benchmark results
+
+ -- Joshua M. Boniface <joshua@boniface.me>  Fri, 08 Mar 2024 14:23:06 -0500
+
 pvc (0.9.95-0) unstable; urgency=high
 
   * [API Daemon/CLI Client] Adds a flag to allow duplicate VNIs in network templates
@@ -33,7 +33,7 @@ import os
 import signal
 
 # Daemon version
-version = "0.9.95"
+version = "0.9.98"
 
 
 ##########################################################
@@ -19,6 +19,11 @@
 #
 ###############################################################################
 
+from sys import argv
 import pvcnoded.Daemon  # noqa: F401
 
+if "--version" in argv:
+    print(pvcnoded.Daemon.version)
+    exit(0)
+
 pvcnoded.Daemon.entrypoint()
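A sketch of how an update-motd.sh-style consumer could read the version non-interactively; the entrypoint path used here is an assumed example and will depend on how the node daemon is packaged:

```python
# Hypothetical consumer of the new --version flag; the entrypoint path is an
# assumed example, not the canonical installed location.
import subprocess

result = subprocess.run(
    ["python3", "/usr/share/pvc/pvcnoded.py", "--version"],
    capture_output=True,
    text=True,
)
print(f"PVC node daemon version: {result.stdout.strip()}")
```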
@@ -49,7 +49,7 @@ import re
 import json
 
 # Daemon version
-version = "0.9.95"
+version = "0.9.98"
 
 
 ##########################################################
@@ -231,7 +231,7 @@ class NetstatsInstance(object):
         # Get a list of all active interfaces
         net_root_path = "/sys/class/net"
         all_ifaces = list()
-        for (_, dirnames, _) in walk(net_root_path):
+        for _, dirnames, _ in walk(net_root_path):
             all_ifaces.extend(dirnames)
         all_ifaces.sort()
@@ -521,7 +521,7 @@ class NodeInstance(object):
         self.logger.out("Acquired write lock for synchronization phase F", state="o")
         time.sleep(0.2)  # Time for reader to acquire the lock
         # 4. Add gateway IPs
-        for network in self.d_network:
+        for network in self.d_network.copy():
             self.d_network[network].createGateways()
         self.logger.out("Releasing write lock for synchronization phase F", state="i")
         self.zkhandler.write([("base.config.primary_node.sync_lock", "")])
@@ -253,12 +253,16 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
         state="i",
         prefix=f"fencing {node_name}",
     )
-    ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
+    (
+        ipmi_intermediate_status_retcode,
+        ipmi_intermediate_status_stdout,
+        ipmi_intermediate_status_stderr,
+    ) = common.run_os_command(
         f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
     )
-    if ipmi_status_retcode == 0:
+    if ipmi_intermediate_status_retcode == 0:
         logger.out(
-            f"Current chassis power state is: {ipmi_status_stdout.strip()}",
+            f"Current chassis power state is: {ipmi_intermediate_status_stdout.strip()}",
             state="i",
             prefix=f"fencing {node_name}",
         )
@@ -299,12 +303,14 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
         state="i",
         prefix=f"fencing {node_name}",
     )
-    ipmi_status_retcode, ipmi_status_stdout, ipmi_status_stderr = common.run_os_command(
-        f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
-    )
+    ipmi_final_status_retcode, ipmi_final_status_stdout, ipmi_final_status_stderr = (
+        common.run_os_command(
+            f"/usr/bin/ipmitool -I lanplus -H {ipmi_hostname} -U {ipmi_user} -P {ipmi_password} chassis power status"
+        )
+    )
 
     if ipmi_stop_retcode == 0:
-        if ipmi_status_stdout.strip() == "Chassis Power is on":
+        if ipmi_intermediate_status_stdout.strip() == "Chassis power is off":
+            if ipmi_final_status_stdout.strip() == "Chassis Power is on":
                # We successfully rebooted the node and it is powered on; this is a successful fence
                logger.out(
                    "Successfully rebooted dead node; proceeding with fence recovery action",
@@ -312,7 +318,7 @@ def reboot_via_ipmi(node_name, ipmi_hostname, ipmi_user, ipmi_password, logger):
                    prefix=f"fencing {node_name}",
                )
                return True
-        elif ipmi_status_stdout.strip() == "Chassis Power is off":
+            elif ipmi_final_status_stdout.strip() == "Chassis Power is off":
                # We successfully rebooted the node but it is powered off; this might be expected or not, but the node is confirmed off so we can call it a successful fence
                logger.out(
                    "Chassis power is in confirmed off state after successful IPMI reboot; proceeding with fence recovery action",
|
||||
else:
|
||||
# We successfully rebooted the node but it is in some unknown power state; since this might indicate a silent failure, we must call it a failed fence
|
||||
logger.out(
|
||||
f"Chassis power is in an unknown state ({ipmi_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action",
|
||||
f"Chassis power is in an unknown state ({ipmi_final_status_stdout.strip()}) after successful IPMI reboot; NOT proceeding fence recovery action",
|
||||
state="e",
|
||||
prefix=f"fencing {node_name}",
|
||||
)
|
||||
return False
|
||||
else:
|
||||
if ipmi_status_stdout.strip() == "Chassis Power is off":
|
||||
if ipmi_final_status_stdout.strip() == "Chassis Power is off":
|
||||
# We failed to reboot the node but it is powered off; it has probably suffered a serious hardware failure, but the node is confirmed off so we can call it a successful fence
|
||||
logger.out(
|
||||
"Chassis power is in confirmed off state after failed IPMI reboot; proceeding with fence recovery action",
|
||||
|
@@ -743,7 +743,7 @@ def node_keepalive(logger, config, zkhandler, this_node, netstats):
     # Get node performance statistics
     this_node.memtotal = int(psutil.virtual_memory().total / 1024 / 1024)
     this_node.memused = int(psutil.virtual_memory().used / 1024 / 1024)
-    this_node.memfree = int(psutil.virtual_memory().free / 1024 / 1024)
+    this_node.memfree = int(psutil.virtual_memory().available / 1024 / 1024)
     this_node.cpuload = round(os.getloadavg()[0], 2)
 
     # Get node network statistics via netstats instance
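The rationale: `free` counts only fully unused pages, while `available` also includes reclaimable page cache, so `total - available` reflects actually allocated memory far better on a busy hypervisor. A standalone comparison:

```python
# Standalone illustration of why the keepalive now uses "available": on a
# busy host, free is tiny while available stays large, because page cache
# is reclaimable and should not count as allocated memory.
import psutil

vm = psutil.virtual_memory()
mib = 1024 * 1024
print(f"total:     {vm.total // mib} MiB")
print(f"free:      {vm.free // mib} MiB   (fully unused pages only)")
print(f"available: {vm.available // mib} MiB   (unused plus reclaimable cache)")
print(f"allocated: {(vm.total - vm.available) // mib} MiB")
```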
@@ -44,7 +44,7 @@ from daemon_lib.vmbuilder import (
 )
 
 # Daemon version
-version = "0.9.95"
+version = "0.9.98"
 
 
 config = cfg.get_configuration()