Finish up Ceph OSD removal, add locking to commands

This commit is contained in:
2018-10-30 22:41:44 -04:00
parent 89a3e0c7ee
commit 3e4a6086d5
6 changed files with 200 additions and 44 deletions

View File

@ -22,6 +22,8 @@
import time
import ast
import json
import psutil
import pvcd.log as log
import pvcd.zkhandler as zkhandler
@ -53,7 +55,7 @@ class CephOSDInstance(object):
except AttributeError:
data = ''
if data != self.node:
if data and data != self.node:
self.node = data
@self.zk_conn.DataWatch('/ceph/osds/{}/stats'.format(self.osd_id))
@ -68,8 +70,8 @@ class CephOSDInstance(object):
except AttributeError:
data = ''
if data != self.stats:
self.stats = dict(ast.literal_eval(data))
if data and data != self.stats:
self.stats = json.loads(data)
def add_osd(zk_conn, logger, node, device):
# We are ready to create a new OSD on this node
@ -77,7 +79,8 @@ def add_osd(zk_conn, logger, node, device):
try:
# 1. Create an OSD; we do this so we know what ID will be gen'd
retcode, stdout, stderr = common.run_os_command('ceph osd create')
if retcode != 0:
if retcode:
print('ceph osd create')
print(stdout)
print(stderr)
raise
@ -85,7 +88,8 @@ def add_osd(zk_conn, logger, node, device):
# 2. Remove that newly-created OSD
retcode, stdout, stderr = common.run_os_command('ceph osd rm {}'.format(osd_id))
if retcode != 0:
if retcode:
print('ceph osd rm')
print(stdout)
print(stderr)
raise
@ -97,7 +101,8 @@ def add_osd(zk_conn, logger, node, device):
device=device
)
)
if retcode != 0:
if retcode:
print('ceph-volume lvm prepare')
print(stdout)
print(stderr)
raise
@ -108,7 +113,8 @@ def add_osd(zk_conn, logger, node, device):
osdid=osd_id
)
)
if retcode != 0:
if retcode:
print('ceph-volume lvm activate')
print(stdout)
print(stderr)
raise
@ -120,7 +126,8 @@ def add_osd(zk_conn, logger, node, device):
node=node
)
)
if retcode != 0:
if retcode:
print('ceph osd crush add')
print(stdout)
print(stderr)
raise
@ -132,7 +139,8 @@ def add_osd(zk_conn, logger, node, device):
osdid=osd_id
)
)
if retcode != 0:
if retcode:
print('systemctl status')
print(stdout)
print(stderr)
raise
@ -141,15 +149,16 @@ def add_osd(zk_conn, logger, node, device):
zkhandler.writedata(zk_conn, {
'/ceph/osds/{}'.format(osd_id): '',
'/ceph/osds/{}/node'.format(osd_id): node,
'/ceph/osds/{}/size'.format(osd_id): '',
'/ceph/osds/{}/stats'.format(osd_id): '{}'
})
# Log it
logger.out('Created new OSD disk with ID {}'.format(osd_id), state='o')
return True
except Exception as e:
# Log it
logger.out('Failed to create new OSD disk: {}'.format(e), state='e')
return False
def remove_osd(zk_conn, logger, osd_id, osd_obj):
logger.out('Removing OSD disk {}'.format(osd_id), state='i')
@ -163,16 +172,78 @@ def remove_osd(zk_conn, logger, osd_id, osd_obj):
# 1. Set the OSD out so it will flush
retcode, stdout, stderr = common.run_os_command('ceph osd out {}'.format(osd_id))
if retcode != 0:
if retcode:
print('ceph osd out')
print(stdout)
print(stderr)
raise
# 2. Wait for the OSD to flush
osd_string = str()
while True:
retcode, stdout, stderr = common.run_os_command('ceph health')
health_string = stdout
except:
pass
retcode, stdout, stderr = common.run_os_command('ceph pg dump osds --format json')
dump_string = json.loads(stdout)
for osd in dump_string:
if str(osd['osd']) == osd_id:
osd_string = osd
print(osd_string)
num_pgs = osd_string['num_pgs']
if num_pgs > 0:
time.sleep(5)
else:
break
# 3. Stop the OSD process and wait for it to be terminated
retcode, stdout, stderr = common.run_os_command('systemctl stop ceph-osd@{}'.format(osd_id))
if retcode:
print('systemctl stop')
print(stdout)
print(stderr)
raise
# FIXME: There has to be a better way to do this /shrug
while True:
is_osd_up = False
# Find if there is a process named ceph-osd with arg '--id {id}'
for p in psutil.process_iter(attrs=['name', 'cmdline']):
if 'ceph-osd' == p.info['name'] and '--id {}'.format(osd_id) in ' '.join(p.info['cmdline']):
is_osd_up = True
# If there isn't, continue
if not is_osd_up:
break
# 4. Delete OSD from ZK
zkhandler.deletekey(zk_conn, '/ceph/osds/{}'.format(osd_id))
# 5. Determine the block devices
retcode, stdout, stderr = common.run_os_command('readlink /var/lib/ceph/osd/ceph-{}/block'.format(osd_id))
vg_name = stdout.split('/')[-2] # e.g. /dev/ceph-<uuid>/osd-block-<uuid>
retcode, stdout, stderr = common.run_os_command('vgs --separator , --noheadings -o pv_name {}'.format(vg_name))
pv_block = stdout
# 6. Zap the volumes
retcode, stdout, stderr = common.run_os_command('ceph-volume lvm zap --destroy {}'.format(pv_block))
if retcode:
print('ceph-volume lvm zap')
print(stdout)
print(stderr)
raise
# 7. Purge the OSD from Ceph
retcode, stdout, stderr = common.run_os_command('ceph osd purge {} --yes-i-really-mean-it'.format(osd_id))
if retcode:
print('ceph osd purge')
print(stdout)
print(stderr)
raise
# Log it
logger.out('Purged OSD disk with ID {}'.format(osd_id), state='o')
return True
except Exception as e:
# Log it
logger.out('Failed to purge OSD disk with ID {}: {}'.format(osd_id, e), state='e')
return False
class CephPool(object):
def __init__(self):