Files
pvc-ansible/oneshot/update-pvc-cluster.yml
Joshua M. Boniface cd164d1984 Increase all wait timeouts to 30s
Ensure that even on slow(er) clusters, these timeouts have more time to
complete before proceeding so the task won't fail.
2021-01-05 16:17:19 -05:00

165 lines
4.5 KiB
YAML

---
# Rolling update of a PVC cluster.
#
# For each host in turn: enable PVC maintenance mode, run a full apt upgrade,
# then run the check_mk "freshness" and "kernelversion" plugins. If either
# check is reported as changed, the node is switched to secondary, flushed,
# its Ceph/PVC/Zookeeper daemons are stopped, and it is rebooted and brought
# back into service. Maintenance mode is switched off again at the end.
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  # Run the whole task list against one host before moving on to the next.
  serial: 1

  tasks:
    - name: set PVC maintenance mode
      command: pvc maintenance on

    - name: aptitude full upgrade and cleanup
      apt:
        update_cache: true
        autoremove: true
        autoclean: true
        upgrade: "full"

    - name: clean apt archives
      file:
        path: "/var/cache/apt/archives"
        state: "absent"

    # Both checks below never fail the play (failed_when: false); an exit code
    # of 1 is reported as "changed" and is what triggers the restart block.
    # NOTE(review): presumably rc 1 means stale libraries / a newer kernel is
    # installed - confirm against the check_mk plugins.
    - name: check library freshness
      command: /usr/lib/check_mk_agent/plugins/freshness
      register: freshness
      changed_when: freshness.rc == 1
      failed_when: false

    - name: check kernel version
      command: /usr/lib/check_mk_agent/plugins/kernelversion
      register: kernelversion
      changed_when: kernelversion.rc == 1
      failed_when: false

    - name: restart system cleanly
      # NOTE(review): the condition is attached to the whole block (not only
      # to its last task), so hosts where neither check changed are not
      # flushed or rebooted - confirm this matches the intended nesting.
      when: freshness.changed or kernelversion.changed
      block:
        # Best effort: errors from this command are ignored.
        - name: secondary node
          command: 'pvc node secondary {{ ansible_hostname }}'
          ignore_errors: true

        # The pause runs on the controller (connection: local, no become).
        - name: wait 30 seconds for system to stabilize
          pause:
            seconds: 30
          become: false
          connection: local

        - name: flush node
          command: 'pvc node flush {{ ansible_hostname }} --wait'

        # Count the `virsh list` lines containing "running"; poll every 10 s,
        # up to 60 retries, until the count is 0.
        - name: ensure VMs are migrated away
          shell: "virsh list | grep running | wc -l"
          register: virshcount
          failed_when: virshcount.stdout != "0"
          until: virshcount.stdout == "0"
          retries: 60
          delay: 10

        - name: wait 30 seconds for system to stabilize
          pause:
            seconds: 30
          become: false
          connection: local

        - name: set OSD noout
          command: pvc storage osd set noout

        # grep exits non-zero when no OSD unit is listed; that is tolerated so
        # the loop below simply iterates over an empty list.
        - name: get running OSD services
          shell: "systemctl | awk '{ print $1 }' | grep 'ceph-osd@[0-9]*.service'"
          ignore_errors: true
          register: osd_services

        - name: stop Ceph OSD daemons cleanly
          service:
            name: "{{ item }}"
            state: stopped
          ignore_errors: true
          with_items: "{{ osd_services.stdout_lines }}"

        - name: stop Ceph Monitor daemon cleanly
          service:
            name: "ceph-mon@{{ ansible_hostname }}"
            state: stopped
          ignore_errors: true

        - name: stop Ceph Manager daemon cleanly
          service:
            name: "ceph-mgr@{{ ansible_hostname }}"
            state: stopped
          ignore_errors: true

        - name: wait 30 seconds for system to stabilize
          pause:
            seconds: 30
          become: false
          connection: local

        # pvc-flush is also disabled here; it is re-enabled after the reboot
        # by "start and enable PVC flush daemon cleanly" below.
        - name: stop and disable PVC flush daemon cleanly
          service:
            name: "pvc-flush"
            state: stopped
            enabled: false

        - name: stop PVC daemon cleanly
          service:
            name: "pvcnoded"
            state: stopped

        - name: stop Zookeeper daemon cleanly
          service:
            name: "zookeeper"
            state: stopped

        - name: restart system
          reboot:
            post_reboot_delay: 15
            reboot_timeout: 1800

        # Prints OK when field 1 equals field 3 of the 'osds:' line, else NOK.
        # NOTE(review): presumably total OSDs vs. OSDs up - confirm against
        # the `ceph osd stat` output format.
        # Poll every 10 s, up to 60 retries, until OK is printed.
        - name: make sure all OSDs are active
          shell: "ceph osd stat | grep 'osds:' | awk '{ if ( $1 == $3 ) { print \"OK\" } else { print \"NOK\" } }'"
          register: osdstat
          failed_when: osdstat.stdout == "NOK"
          until: osdstat.stdout == "OK"
          retries: 60
          delay: 10

        # grep exits 1 when the phrase is absent (the desired state), so
        # failure is decided from stdout instead of the return code. Poll
        # every 10 s, up to 60 retries, until the phrase is gone.
        - name: make sure all PGs have recovered
          shell: "ceph health | grep -wo 'Degraded data redundancy'"
          register: cephhealth
          failed_when: cephhealth.stdout == "Degraded data redundancy"
          until: cephhealth.stdout == ""
          retries: 60
          delay: 10

        - name: unset OSD noout
          command: pvc storage osd unset noout

        - name: unflush node
          command: "pvc node ready {{ ansible_hostname }} --wait"

        - name: wait 30 seconds for system to stabilize
          pause:
            seconds: 30
          become: false
          connection: local

        - name: start and enable PVC flush daemon cleanly
          service:
            name: "pvc-flush"
            state: started
            enabled: true

        - name: reset any systemd failures
          command: systemctl reset-failed

    - name: unset PVC maintenance mode
      command: pvc maintenance off

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local