pvc-ansible/oneshot/upgrade-pvc-cluster_deb12.yml

---
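# This playbook upgrades a PVC cluster in place to Debian 12 (bookworm):
# Play 1 checks the minimum PVC version, enables maintenance mode, and records
# the pre-upgrade PostgreSQL/Patroni state; Play 2 upgrades each node serially
# (flush VMs, stop daemons, dist-upgrade, reboot); Plays 3 and 4 migrate Ceph
# to the Pacific release and restart the OSDs; Play 5 upgrades the Patroni
# PostgreSQL database and returns the cluster to normal operation.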
# Play 1: Sanity check, initial configuration upgrade, information gathering, patroni freeze
- hosts: all
remote_user: deploy
become: yes
become_user: root
gather_facts: yes
vars:
minimum_pvc_version: 0.9.69
tasks:
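# Parse pvc_version out of the JSON cluster status; dpkg --compare-versions
# exits non-zero (failing the play) if the running version is older than
# minimum_pvc_version.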
- name: check that cluster is on the minimum PVC version or newer
shell: "dpkg --compare-versions $(pvc --quiet cluster status -f json | jq '.pvc_version' | tr -d '\"') ge {{ minimum_pvc_version }}"
run_once: yes
- name: set PVC maintenance mode
command: pvc cluster maintenance on
ignore_errors: yes
- name: include base role
import_role:
name: base
- name: include pvc role
import_role:
name: pvc
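# Record the pre-upgrade PostgreSQL binary directory and the current Patroni
# leader/followers now; the database upgrade play at the end relies on these
# facts gathered before the Debian upgrade.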
- name: get current postgresql directory
shell: "find /usr/lib/postgresql -mindepth 1 -maxdepth 1 -type d -name '[0-9][0-9]' | sort -n | tail -1"
register: old_postgres_dir_output
- name: set old_postgres_bin_dir fact
set_fact:
old_postgres_bin_dir: "{{ old_postgres_dir_output.stdout.strip() }}/bin"
- name: get current patroni leader node
shell: "patronictl -c /etc/patroni/config.yml list --format json | jq '.[] | select(.Role == \"Leader\") | .Member' | tr -d '\"'"
register: patroni_leader_output
- name: set patroni_leader fact
set_fact:
patroni_leader: "{% for node in ansible_play_hosts if node.split('.')[0].strip() == patroni_leader_output.stdout.strip() %}{{ node }}{% endfor %}"
- name: set patroni_followers fact
set_fact:
patroni_followers: "{{ ansible_play_hosts | difference([ patroni_leader ]) }}"
- debug:
var: patroni_leader
- debug:
var: patroni_followers
- name: fail out if patroni_leader is empty
command: "echo {{ patroni_leader }}"
register: check_output
failed_when: check_output.stdout == ""
- name: stop and mask patroni service on followers to perform database upgrade (later)
service:
name: patroni
state: stopped
masked: yes
run_once: yes
delegate_to: "{{ item }}"
loop: "{{ patroni_followers }}"
- name: stop and mask patroni service on leader to perform database upgrade (later)
service:
name: patroni
state: stopped
masked: yes
run_once: yes
delegate_to: "{{ patroni_leader }}"
# Play 2: Per-node upgrade to Debian 12
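# serial: 1 upgrades one node at a time: its VMs are flushed to the remaining
# nodes and its storage daemons are stopped before the in-place OS upgrade and
# reboot, so the rest of the cluster keeps serving.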
- hosts: all
remote_user: deploy
become: yes
become_user: root
gather_facts: yes
serial: 1
tasks:
- name: set Debian details (with ansible_distribution_*)
set_fact:
debian_version: "{{ ansible_distribution_major_version }}"
debian_codename: "{{ ansible_distribution_release }}"
when: ansible_distribution_release is defined
- name: set Debian details (with ansible_lsb)
set_fact:
debian_version: "{{ ansible_lsb.major_release }}"
debian_codename: "{{ ansible_lsb.codename }}"
when: ansible_lsb.codename is defined
- name: secondary node
command: "pvc node secondary {{ ansible_hostname }}"
ignore_errors: yes
- name: wait 30 seconds for system to stabilize
pause:
seconds: 30
become: no
connection: local
- name: flush node
command: "pvc node flush {{ ansible_hostname }} --wait"
ignore_errors: yes
- name: ensure VMs are migrated away
shell: "virsh list | grep running | wc -l"
register: virshcount
failed_when: virshcount.stdout != "0"
until: virshcount.stdout == "0"
retries: 60
delay: 10
- name: make sure all VMs have migrated
shell: "pvc node info {{ ansible_hostname }} | grep '^Domain State' | awk '{ print $NF }'"
register: pvcflush
failed_when: pvcflush.stdout != 'flushed'
until: pvcflush.stdout == 'flushed'
retries: 60
delay: 10
- name: wait 15 seconds for system to stabilize
pause:
seconds: 15
become: no
connection: local
- name: stop PVC daemon cleanly
service:
name: pvcnoded
state: stopped
- name: stop Zookeeper daemon cleanly
service:
name: zookeeper
state: stopped
- name: wait 15 seconds for system to stabilize
pause:
seconds: 15
become: no
connection: local
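# noout keeps Ceph from marking this node's OSDs out (and rebalancing data)
# while they are stopped for the upgrade and reboot.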
- name: set OSD noout
command: pvc storage osd set noout
ignore_errors: yes
- name: get running OSD services
shell: "systemctl | awk '{ print $1 }' | grep 'ceph-osd@[0-9]*.service'"
ignore_errors: yes
register: osd_services
- name: stop Ceph OSD daemons cleanly
service:
name: "{{ item }}"
state: stopped
ignore_errors: yes
with_items: "{{ osd_services.stdout_lines }}"
- name: stop Ceph Monitor daemon cleanly
service:
name: "ceph-mon@{{ ansible_hostname }}"
state: stopped
ignore_errors: yes
- name: stop Ceph Manager daemon cleanly
service:
name: "ceph-mgr@{{ ansible_hostname }}"
state: stopped
ignore_errors: yes
- name: wait 30 seconds for system to stabilize
pause:
seconds: 30
become: no
connection: local
- name: remove possible obsolete cset configuration
file:
dest: "/etc/systemd/system/{{ item }}"
state: absent
with_items:
- ceph-osd@.service.d
- ceph-osd-cpuset.service
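# Point APT at bookworm; in Debian 12 the non-free firmware packages moved to
# the separate non-free-firmware component.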
- name: replace sources.list entries with bookworm
replace:
dest: "{{ item }}"
regexp: "{{ debian_codename }}"
replace: "bookworm"
with_items:
- /etc/apt/sources.list
- name: replace sources.list non-free with non-free-firmware
replace:
dest: "{{ item }}"
regexp: "non-free$"
replace: "non-free-firmware"
with_items:
- /etc/apt/sources.list
- name: remove security entry if on Debian 10
lineinfile:
dest: /etc/apt/sources.list
regexp: "security.debian.org"
state: absent
when: debian_version|int < 11
- name: link python to python3 on Debian 10
file:
state: link
src: python3
dest: /usr/bin/python
force: yes
when: debian_version|int < 11
- name: update apt cache
apt:
update_cache: yes
register: apt_res
retries: 5
until: apt_res is success
# This seems insane, but it works around a fatal libcrypt1/libc6 dependency conflict when upgrading directly from Debian 10 to Debian 12
- name: perform initial upgrade on Debian 10
block:
- name: install script to perform safe d10->d12 upgrade
copy:
dest: /tmp/upgrade-d10.sh
mode: 0755
content: |
#!/usr/bin/env bash
exec 1>/tmp/upgrade-d10.log
exec 2>&1
set -o xtrace
export DEBIAN_FRONTEND=noninteractive
echo "Forcibly update libcrypt1 and libc6 to avoid conflicts"
apt-get --download-only install --yes libcrypt1
dpkg --force-all --install /var/cache/apt/archives/{libcrypt1,libpam0g,libc6}*.deb
echo "Fix broken packages"
apt --fix-broken --option Dpkg::Options::="--force-confold" install --yes
echo "Upgrade core apt packages (forcing confold)"
apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes apt dpkg dpkg-dev base-files zstd
echo "Upgrade ca-certificates (forcing confold)"
apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes ca-certificates
echo "Upgrade sudo (forcing confold)"
apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes sudo
echo "Upgrade core PVC packages (forcing confold)"
apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes ceph-osd ceph-mds ceph-mon ceph-mgr radosgw zookeeper zookeeper-bin patroni libvirt-daemon-system frr
echo "Perform main upgrade (forcing confnew)"
apt-get --no-install-recommends --option Dpkg::Options::="--force-confnew" upgrade --yes
echo "Perform main dist-upgrade (forcing confnew)"
apt-get --no-install-recommends --option Dpkg::Options::="--force-confnew" dist-upgrade --yes
echo "Restart SSH so Ansible can continue"
systemctl restart ssh
- name: run script to perform safe d10->d12 upgrade (will take a long time)
shell: /tmp/upgrade-d10.sh
- name: install python-is-python3
apt:
name: python-is-python3
state: latest
when: debian_version|int < 11
- name: apt upgrade
apt:
upgrade: safe
register: apt_res
retries: 5
until: apt_res is success
- name: apt dist upgrade and cleanup
apt:
autoremove: yes
autoclean: yes
upgrade: dist
register: apt_res
retries: 5
until: apt_res is success
- name: clean up obsolete kernels
command: /usr/local/sbin/kernel-cleanup.sh
- name: clean up obsolete packages
command: /usr/local/sbin/dpkg-cleanup.sh
- name: clean apt archives
file:
dest: /var/cache/apt/archives
state: absent
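# Re-gather facts so the base and pvc roles (and anything keyed on the Debian
# release) see the upgraded Debian 12 values.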
- name: regather facts
setup:
- name: include base role
import_role:
name: base
- name: include pvc role
import_role:
name: pvc
- name: apt full upgrade and cleanup
apt:
update_cache: yes
autoremove: yes
autoclean: yes
upgrade: full
register: apt_res
retries: 5
until: apt_res is success
- name: remove obsolete database directories
file:
dest: "{{ item }}"
state: absent
with_items:
- "/etc/postgresql/13"
- "/var/lib/postgresql/13"
- name: restart system
reboot:
post_reboot_delay: 15
reboot_timeout: 1800
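# 'ceph osd stat' prints "N osds: M up ..."; the awk check only reports OK once
# every OSD is back up after the reboot.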
- name: make sure all OSDs are active
shell: "ceph osd stat | grep 'osds:' | awk '{ if ( $1 == $3 ) { print \"OK\" } else { print \"NOK\" } }'"
register: osdstat
failed_when: osdstat.stdout == "NOK"
until: osdstat.stdout == "OK"
retries: 60
delay: 10
- name: make sure all PGs have recovered
shell: "ceph health | grep -wo 'Degraded data redundancy'"
register: cephhealth
failed_when: cephhealth.stdout == "Degraded data redundancy"
until: cephhealth.stdout == ""
retries: 60
delay: 10
- name: unset OSD noout
command: pvc storage osd unset noout
- name: wait 30 seconds for system to stabilize
pause:
seconds: 30
become: no
connection: local
- name: unflush node
command: "pvc node ready {{ ansible_hostname }} --wait"
- name: make sure all VMs have returned
shell: "pvc node info {{ ansible_hostname }} | grep '^Domain State' | awk '{ print $NF }'"
register: pvcunflush
failed_when: pvcunflush.stdout != 'ready'
until: pvcunflush.stdout == 'ready'
retries: 60
delay: 10
- name: wait 30 seconds for system to stabilize
pause:
seconds: 30
become: no
connection: local
- name: reset any systemd failures
command: systemctl reset-failed
- name: wait 30 seconds for system to stabilize
pause:
seconds: 30
become: no
connection: local
# Play 3: Ceph upgrades for Debian 12
- hosts: all
remote_user: deploy
become: yes
become_user: root
gather_facts: yes
tasks:
- name: backup CRUSH map
command: ceph osd getcrushmap -o /srv/backups/backup-crushmap-deb12-upgrade
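# These cluster-wide Ceph adjustments for the Pacific release only need to run
# once; the ceph-mgr restart below runs on every node.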
- block:
- name: disable insecure global id reclaim in Ceph
command: ceph config set mon auth_allow_insecure_global_id_reclaim false
- name: disable default pg autoscaler
command: ceph config set global osd_pool_default_pg_autoscale_mode off
- name: get pools
shell: "ceph osd lspools | awk '{ $1=\"\"; print $0 }' | tr -d ' '"
register: pool_output
- name: set pools fact
set_fact:
ceph_pools: "{{ pool_output.stdout.split('\n') | list }}"
- name: disable pg autoscaler on each pool
command: "ceph osd pool set {{ item }} pg_autoscale_mode off"
loop: "{{ ceph_pools }}"
- name: set OSDs to require pacific
command: ceph osd require-osd-release pacific
- name: update CRUSH map
command: ceph osd crush set-all-straw-buckets-to-straw2
- name: set OSDs to quick fsck
command: ceph config set osd bluestore_fsck_quick_fix_on_mount true
run_once: yes
- name: restart ceph-mgr
service:
name: "ceph-mgr@{{ ansible_hostname }}"
state: restarted
# Play 4: Ceph OSD restart (serial per-node)
- hosts: all
remote_user: deploy
become: yes
become_user: root
gather_facts: yes
serial: 1
tasks:
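# Restart the OSDs one node at a time and wait for all OSDs to come back up and
# all PGs to recover before moving on; with bluestore_fsck_quick_fix_on_mount
# enabled above, the OSDs perform their on-mount repair during this restart.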
- name: restart all OSDs on node
shell: "systemctl restart --all ceph-osd@*"
- name: make sure all OSDs are active
shell: "ceph osd stat | grep 'osds:' | awk '{ if ( $1 == $3 ) { print \"OK\" } else { print \"NOK\" } }'"
register: osdstat
failed_when: osdstat.stdout == "NOK"
until: osdstat.stdout == "OK"
retries: 60
delay: 10
- name: make sure all PGs have recovered
shell: "ceph health | grep -wo 'Degraded data redundancy'"
register: cephhealth
failed_when: cephhealth.stdout == "Degraded data redundancy"
until: cephhealth.stdout == ""
retries: 60
delay: 10
# Play 5: Patroni upgrades for Debian 12
- hosts: all
remote_user: deploy
become: yes
become_user: root
gather_facts: yes
vars:
new_postgres_version: 15
tasks:
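# Patroni is stopped everywhere, each node's database is upgraded in place with
# pg_upgrade, the old Patroni cluster state is cleared from Zookeeper, and the
# leader is started first; the followers' data directories are removed so they
# re-initialize by replicating from the upgraded leader.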
- name: stop patroni service on followers to perform database upgrade
service:
name: patroni
state: stopped
run_once: yes
delegate_to: "{{ item }}"
loop: "{{ patroni_followers }}"
- name: stop patroni service on leader to perform database upgrade
service:
name: patroni
state: stopped
run_once: yes
delegate_to: "{{ patroni_leader }}"
- block:
- name: initialize new postgres database
shell:
cmd: "sudo -u postgres /usr/lib/postgresql/{{ new_postgres_version }}/bin/initdb -D /var/lib/postgresql/{{ new_postgres_version }}/pvc"
chdir: "/var/lib/postgresql"
- name: enable data checksums in new database
shell:
cmd: "sudo -u postgres /usr/lib/postgresql/{{ new_postgres_version }}/bin/pg_checksums --enable /var/lib/postgresql/{{ new_postgres_version }}/pvc"
chdir: "/var/lib/postgresql"
- name: run postgres upgrade
shell:
cmd: "sudo -u postgres /usr/lib/postgresql/{{ new_postgres_version }}/bin/pg_upgrade -b {{ old_postgres_bin_dir }} -d /var/lib/postgresql/patroni/pvc -D /var/lib/postgresql/{{ new_postgres_version }}/pvc"
chdir: "/var/lib/postgresql"
- name: move old postgres database out of the way
shell:
cmd: "sudo -u postgres mv /var/lib/postgresql/patroni/pvc /var/lib/postgresql/patroni/pvc.old"
chdir: "/var/lib/postgresql"
- name: move new postgres database into place
shell:
cmd: "sudo -u postgres mv /var/lib/postgresql/{{ new_postgres_version }}/pvc /var/lib/postgresql/patroni/pvc"
chdir: "/var/lib/postgresql"
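# Newer PostgreSQL releases refuse to start if a recovery.conf is present, so
# make sure any copy left over from replication is gone.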
- name: ensure recovery.conf is absent
file:
dest: /var/lib/postgresql/patroni/pvc/recovery.conf
state: absent
- name: delete patroni cluster znodes
shell: "/usr/share/zookeeper/bin/zkCli.sh -server {{ ansible_hostname }}:2181 deleteall /patroni/pvc"
- name: start patroni service on leader
service:
name: patroni
state: started
masked: no
run_once: yes
delegate_to: "{{ patroni_leader }}"
- name: remove old data directory on patroni followers
file:
dest: /var/lib/postgresql/patroni/pvc
state: absent
run_once: yes
delegate_to: "{{ item }}"
loop: "{{ patroni_followers }}"
- name: start patroni service on followers
service:
name: patroni
state: started
masked: no
run_once: yes
delegate_to: "{{ item }}"
loop: "{{ patroni_followers }}"
- name: wait 30 seconds for system to stabilize
pause:
seconds: 30
become: no
connection: local
- name: restart PVC daemons on all nodes
shell: "systemctl restart pvchealthd && systemctl restart pvcworkerd && systemctl restart pvcnoded && sleep 30"
run_once: yes
delegate_to: "{{ item }}"
loop: "{{ ansible_play_hosts }}"
- name: set first node as primary coordinator
command: "pvc node primary --wait {{ ansible_play_hosts[0].split('.')[0] }}"
run_once: yes
delegate_to: "{{ ansible_play_hosts[0] }}"
- name: unset PVC maintenance mode
command: pvc cluster maintenance off