From 0587bcbd6771231c95cb466453d8f7016740793d Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 12 Aug 2020 22:16:56 -0400 Subject: [PATCH] Go back to manual command for OSD stats Using the Ceph library was a disaster here; it had no timeout or way to force it to continue, so keepalives would become stuck and trigger fence storms. Go back to the manual osd dump command with a 2s timeout, which is far more reliable and can be adequately terminated if it runs long. --- node-daemon/pvcnoded/Daemon.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/node-daemon/pvcnoded/Daemon.py b/node-daemon/pvcnoded/Daemon.py index 26a7fe92..e01229a5 100644 --- a/node-daemon/pvcnoded/Daemon.py +++ b/node-daemon/pvcnoded/Daemon.py @@ -1149,7 +1149,8 @@ def collect_ceph_stats(queue): command = { "prefix": "osd dump", "format": "json" } try: - osd_dump_raw = json.loads(ceph_conn.mon_command(json.dumps(command), b'', timeout=1)[1])['osds'] + retcode, stdout, stderr = common.run_os_command('ceph osd dump --format json --connect-timeout 2', timeout=2) + osd_dump_raw = json.loads(stdout)['osds'] except Exception as e: logger.out('Failed to obtain OSD data: {}'.format(e), state='w') osd_dump_raw = []