Avoids errors failing out later if there are any issues with these roles, and ensures the system is fully updated before the actual Debian upgrades.
554 lines
17 KiB
YAML
554 lines
17 KiB
YAML
---
# Play 1: Sanity check, initial configuration upgrade, information gathering, patroni freeze
#
# Runs on every host in parallel. It verifies the PVC version, enables
# maintenance mode, applies the base and pvc roles, records the facts that
# later plays consume (old_postgres_bin_dir, patroni_leader,
# patroni_followers), and finally stops and masks Patroni cluster-wide.
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  vars:
    # Lowest PVC version the cluster must already be running.
    minimum_pvc_version: "0.9.69"
  tasks:
    # dpkg --compare-versions exits non-zero when the comparison is false,
    # which fails this task and aborts the run.
    - name: check that cluster is on the minimum PVC version or newer
      shell: "dpkg --compare-versions $(pvc --quiet cluster status -f json | jq '.pvc_version' | tr -d '\"') ge {{ minimum_pvc_version }}"
      run_once: true

    - name: set PVC maintenance mode
      command: pvc cluster maintenance on
      ignore_errors: true

    - name: include base role
      import_role:
        name: base

    - name: include pvc role
      import_role:
        name: pvc

    # Picks the highest two-digit version directory under /usr/lib/postgresql.
    - name: get current postgresql directory
      shell: "find /usr/lib/postgresql -mindepth 1 -maxdepth 1 -type d -name '[0-9][0-9]' | sort -n | tail -1"
      register: old_postgres_dir_output

    - name: set old_postgres_bin_dir fact
      set_fact:
        old_postgres_bin_dir: "{{ old_postgres_dir_output.stdout.strip() }}/bin"

    - name: get current patroni leader node
      shell: "patronictl -c /etc/patroni/config.yml list --format json | jq '.[] | select(.Role == \"Leader\") | .Member' | tr -d '\"'"
      register: patroni_leader_output

    # Map the short member name reported by patronictl back to the matching
    # play host; the result is an empty string when nothing matches.
    - name: set patroni_leader fact
      set_fact:
        patroni_leader: "{% for node in ansible_play_hosts if node.split('.')[0].strip() == patroni_leader_output.stdout.strip() %}{{ node }}{% endfor %}"

    - name: set patroni_followers fact
      set_fact:
        patroni_followers: "{{ ansible_play_hosts | difference([ patroni_leader ]) }}"

    - debug:
        var: patroni_leader
    - debug:
        var: patroni_followers

    # Checked directly on the fact rather than by echoing it through a
    # remote command and inspecting stdout.
    - name: fail out if patroni_leader is empty
      assert:
        that:
          - patroni_leader | default('', true) | trim | length > 0
        msg: "patroni_leader is empty: no play host matched the Patroni leader"

    # Followers are stopped before the leader.
    - name: stop and mask patroni service on followers to perform database upgrade (later)
      service:
        name: patroni
        state: stopped
        masked: true
      run_once: true
      delegate_to: "{{ item }}"
      loop: "{{ patroni_followers }}"

    - name: stop and mask patroni service on leader to perform database upgrade (later)
      service:
        name: patroni
        state: stopped
        masked: true
      run_once: true
      delegate_to: "{{ patroni_leader }}"
|
|
|
|
# Play 2: Per-node upgrade to Debian 12
#
# Runs one host at a time (serial: 1). For each node: flush VMs away, stop
# the PVC/Zookeeper/Ceph daemons, point APT at bookworm, upgrade, re-apply
# the roles, reboot, then wait for Ceph and PVC to report healthy before
# moving on to the next node.
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  serial: 1
  tasks:
    # debian_version / debian_codename are captured before the upgrade, so
    # later "debian_version|int < 11" conditions refer to the starting release.
    - name: set Debian details (with ansible_distribution_*)
      set_fact:
        debian_version: "{{ ansible_distribution_major_version }}"
        debian_codename: "{{ ansible_distribution_release }}"
      when: ansible_distribution_release is defined

    - name: set Debian details (with ansible_lsb)
      set_fact:
        debian_version: "{{ ansible_lsb.major_release }}"
        debian_codename: "{{ ansible_lsb.codename }}"
      when: ansible_lsb.codename is defined

    - name: secondary node
      command: "pvc node secondary {{ ansible_hostname }}"
      ignore_errors: true

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    - name: flush node
      command: "pvc node flush {{ ansible_hostname }} --wait"
      ignore_errors: true

    # Poll every 10 seconds, up to 60 times, until no domain is running.
    - name: ensure VMs are migrated away
      shell: "virsh list | grep running | wc -l"
      register: virshcount
      failed_when: virshcount.stdout != "0"
      until: virshcount.stdout == "0"
      retries: 60
      delay: 10

    - name: make sure all VMs have migrated
      shell: "pvc node info {{ ansible_hostname }} | grep '^Domain State' | awk '{ print $NF }'"
      register: pvcflush
      failed_when: pvcflush.stdout != 'flushed'
      until: pvcflush.stdout == 'flushed'
      retries: 60
      delay: 10

    - name: wait 15 seconds for system to stabilize
      pause:
        seconds: 15
      become: false
      connection: local

    - name: stop PVC daemon cleanly
      service:
        name: pvcnoded
        state: stopped

    - name: stop Zookeeper daemon cleanly
      service:
        name: zookeeper
        state: stopped

    - name: wait 15 seconds for system to stabilize
      pause:
        seconds: 15
      become: false
      connection: local

    - name: set OSD noout
      command: pvc storage osd set noout
      ignore_errors: true

    # grep exits non-zero when the node has no OSD units; that is tolerated.
    - name: get running OSD services
      shell: "systemctl | awk '{ print $1 }' | grep 'ceph-osd@[0-9]*.service'"
      ignore_errors: true
      register: osd_services

    - name: stop Ceph OSD daemons cleanly
      service:
        name: "{{ item }}"
        state: stopped
      ignore_errors: true
      with_items: "{{ osd_services.stdout_lines }}"

    - name: stop Ceph Monitor daemon cleanly
      service:
        name: "ceph-mon@{{ ansible_hostname }}"
        state: stopped
      ignore_errors: true

    - name: stop Ceph Manager daemon cleanly
      service:
        name: "ceph-mgr@{{ ansible_hostname }}"
        state: stopped
      ignore_errors: true

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    - name: remove possible obsolete cset configuration
      file:
        dest: "/etc/systemd/system/{{ item }}"
        state: absent
      with_items:
        - ceph-osd@.service.d
        - ceph-osd-cpuset.service

    - name: replace sources.list entries will bookworm
      replace:
        dest: "{{ item }}"
        regexp: "{{ debian_codename }}"
        replace: "bookworm"
      with_items:
        - /etc/apt/sources.list

    - name: replace sources.list non-free with non-free-firmware
      replace:
        dest: "{{ item }}"
        regexp: "non-free$"
        replace: "non-free-firmware"
      with_items:
        - /etc/apt/sources.list

    - name: remove security entry if on Debian 10
      lineinfile:
        dest: /etc/apt/sources.list
        regexp: "security.debian.org"
        state: absent
      when: debian_version|int < 11

    - name: link python to python3 on Debian 10
      file:
        state: link
        src: python3
        dest: /usr/bin/python
        force: true
      when: debian_version|int < 11

    - name: update apt cache
      apt:
        update_cache: true
      register: apt_res
      retries: 5
      until: apt_res is success

    # This seems insane, but works around a fatal error upgrading from d10 to d12
    - name: perform initial upgrade on Debian 10
      block:
        - name: install script to perform safe d10->d12 upgrade
          copy:
            dest: /tmp/upgrade-d10.sh
            # Quoted so the mode is passed as an octal string instead of
            # being parsed as a YAML integer.
            mode: "0755"
            content: |
              #!/usr/bin/env bash
              exec 1>/tmp/upgrade-d10.log
              exec 2>&1
              set -o xtrace
              export DEBIAN_FRONTEND=noninteractive
              echo "Forcibly update libcrypt1 and libc6 to avoid conflicts"
              apt-get --download-only install --yes libcrypt1
              dpkg --force-all --install /var/cache/apt/archives/{libcrypt1,libpam0g,libc6}*.deb
              echo "Fix broken packages"
              apt --fix-broken --option Dpkg::Options::="--force-confold" install --yes
              echo "Upgrade core apt packages (forcing confold)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes apt dpkg dpkg-dev base-files zstd
              echo "Upgrade ca-certificates (forcing confold)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes ca-certificates
              echo "Upgrade sudo (forcing confold)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes sudo
              echo "Upgrade core PVC packages (forcing confold)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes ceph-osd ceph-mds ceph-mon ceph-mgr radosgw zookeeper zookeeper-bin patroni libvirt-daemon-system frr
              echo "Perform main upgrade (forcing confnew)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confnew" upgrade --yes
              echo "Perform main dist-upgrade (forcing confnew)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confnew" dist-upgrade --yes
              echo "Restart SSH so Ansible can continue"
              systemctl restart ssh

        - name: run script to perform safe d10->d12 upgrade (will take a long time)
          shell: /tmp/upgrade-d10.sh

        - name: install python-is-python3
          apt:
            name: python-is-python3
            state: latest

      when: debian_version|int < 11

    - name: apt upgrade
      apt:
        upgrade: safe
      register: apt_res
      retries: 5
      until: apt_res is success

    - name: apt dist upgrade and cleanup
      apt:
        autoremove: true
        autoclean: true
        upgrade: dist
      register: apt_res
      retries: 5
      until: apt_res is success

    - name: clean up obsolete kernels
      command: /usr/local/sbin/kernel-cleanup.sh

    - name: clean up obsolete packages
      command: /usr/local/sbin/dpkg-cleanup.sh

    - name: clean apt archives
      file:
        dest: /var/cache/apt/archives
        state: absent

    # Refresh facts so the roles below see the upgraded system.
    - name: regather facts
      setup:

    - name: include base role
      import_role:
        name: base

    - name: include pvc role
      import_role:
        name: pvc

    - name: apt full upgrade and cleanup
      apt:
        update_cache: true
        autoremove: true
        autoclean: true
        upgrade: full
      register: apt_res
      retries: 5
      until: apt_res is success

    - name: remove obsolete database directories
      file:
        dest: "{{ item }}"
        state: absent
      with_items:
        - "/etc/postgresql/13"
        - "/var/lib/postgresql/13"

    - name: restart system
      reboot:
        post_reboot_delay: 15
        reboot_timeout: 1800

    # awk prints OK when the first and third fields of the "osds:" line match.
    - name: make sure all OSDs are active
      shell: "ceph osd stat | grep 'osds:' | awk '{ if ( $1 == $3 ) { print \"OK\" } else { print \"NOK\" } }'"
      register: osdstat
      failed_when: osdstat.stdout == "NOK"
      until: osdstat.stdout == "OK"
      retries: 60
      delay: 10

    # grep -o prints the phrase while it is still present in "ceph health";
    # empty output means it is gone. failed_when must match that phrase
    # exactly (a stray trailing quote previously made it unmatchable).
    - name: make sure all PGs have recovered
      shell: "ceph health | grep -wo 'Degraded data redundancy'"
      register: cephhealth
      failed_when: cephhealth.stdout == "Degraded data redundancy"
      until: cephhealth.stdout == ""
      retries: 60
      delay: 10

    - name: unset OSD noout
      command: pvc storage osd unset noout

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    - name: unflush node
      command: "pvc node ready {{ ansible_hostname }} --wait"

    - name: make sure all VMs have returned
      shell: "pvc node info {{ ansible_hostname }} | grep '^Domain State' | awk '{ print $NF }'"
      register: pvcunflush
      failed_when: pvcunflush.stdout != 'ready'
      until: pvcunflush.stdout == 'ready'
      retries: 60
      delay: 10

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    - name: reset any systemd failures
      command: systemctl reset-failed

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local
|
|
|
|
# Play 3: Ceph upgrades for Debian 12
#
# Backs up the CRUSH map, applies cluster-wide Ceph settings, then restarts
# the ceph-mgr instance on each host.
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  tasks:
    - name: backup CRUSH map
      command: ceph osd getcrushmap -o /srv/backups/backup-crushmap-deb12-upgrade

    # NOTE(review): run_once is taken to apply to the whole block, since a
    # bare block with no keywords would serve no purpose -- confirm against
    # the original playbook indentation.
    - block:
        - name: disable insecure global id reclaim in Ceph
          command: ceph config set mon auth_allow_insecure_global_id_reclaim false

        - name: disable default pg autoscaler
          command: ceph config set global osd_pool_default_pg_autoscale_mode off

        # awk blanks the first column of each line and tr strips the spaces,
        # leaving one pool name per line.
        - name: get pools
          shell: "ceph osd lspools | awk '{ $1=\"\"; print $0 }' | tr -d ' '"
          register: pool_output

        # stdout_lines is an empty list when the command prints nothing,
        # whereas splitting an empty stdout on newlines yields [''] and
        # would make the loop below run with an empty pool name.
        - name: set pools fact
          set_fact:
            ceph_pools: "{{ pool_output.stdout_lines }}"

        - name: disable pg autoscaler on each pool
          command: "ceph osd pool set {{ item }} pg_autoscale_mode off"
          loop: "{{ ceph_pools }}"

        - name: set OSDs to require pacific
          command: ceph osd require-osd-release pacific

        - name: update CRUSH map
          command: ceph osd crush set-all-straw-buckets-to-straw2

        - name: set OSDs to quick fsck
          command: ceph config set osd bluestore_fsck_quick_fix_on_mount true
      run_once: true

    - name: restart ceph-mgr
      service:
        name: "ceph-mgr@{{ ansible_hostname }}"
        state: restarted
|
|
|
|
# Play 4: Ceph OSD restart (serial per-node)
#
# One host at a time: restart every OSD unit on the node, then wait for the
# cluster to report all OSDs and no degraded data before continuing.
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  serial: 1
  tasks:
    - name: restart all OSDs on node
      shell: "systemctl restart --all ceph-osd@*"

    # awk prints OK when the first and third fields of the "osds:" line match.
    - name: make sure all OSDs are active
      shell: "ceph osd stat | grep 'osds:' | awk '{ if ( $1 == $3 ) { print \"OK\" } else { print \"NOK\" } }'"
      register: osdstat
      failed_when: osdstat.stdout == "NOK"
      until: osdstat.stdout == "OK"
      retries: 60
      delay: 10

    # grep -o prints the phrase while it is still present in "ceph health";
    # empty output means it is gone. failed_when must match that phrase
    # exactly (a stray trailing quote previously made it unmatchable).
    - name: make sure all PGs have recovered
      shell: "ceph health | grep -wo 'Degraded data redundancy'"
      register: cephhealth
      failed_when: cephhealth.stdout == "Degraded data redundancy"
      until: cephhealth.stdout == ""
      retries: 60
      delay: 10
|
|
|
|
# Play 5: Patroni upgrades for Debian 12
#
# Uses the patroni_leader, patroni_followers and old_postgres_bin_dir facts
# to pg_upgrade the database on the leader, wipe the follower data
# directories, bring Patroni back, restart the PVC daemons and leave
# maintenance mode.
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  vars:
    # PostgreSQL major version to upgrade the database to.
    new_postgres_version: 15
  tasks:
    - name: stop patroni service on followers to perform database upgrade
      service:
        name: patroni
        state: stopped
      run_once: true
      delegate_to: "{{ item }}"
      loop: "{{ patroni_followers }}"

    - name: stop patroni service on leader to perform database upgrade
      service:
        name: patroni
        state: stopped
      run_once: true
      delegate_to: "{{ patroni_leader }}"

    # NOTE(review): run_once/delegate_to are taken to apply to the whole
    # block, so the in-place upgrade happens exactly once, on the leader --
    # confirm against the original playbook indentation.
    - block:
        - name: initialize new postgres database
          shell:
            cmd: "sudo -u postgres /usr/lib/postgresql/{{ new_postgres_version }}/bin/initdb -D /var/lib/postgresql/{{ new_postgres_version }}/pvc"
            chdir: "/var/lib/postgresql"

        - name: enable data checksums in new database
          shell:
            cmd: "sudo -u postgres /usr/lib/postgresql/{{ new_postgres_version }}/bin/pg_checksums --enable /var/lib/postgresql/{{ new_postgres_version }}/pvc"
            chdir: "/var/lib/postgresql"

        # -b/-d are the old binaries and data dir, -D the new data dir.
        - name: run postgres upgrade
          shell:
            cmd: "sudo -u postgres /usr/lib/postgresql/{{ new_postgres_version }}/bin/pg_upgrade -b {{ old_postgres_bin_dir }} -d /var/lib/postgresql/patroni/pvc -D /var/lib/postgresql/{{ new_postgres_version }}/pvc"
            chdir: "/var/lib/postgresql"

        # Swap the directories: the old data is kept as pvc.old and the
        # upgraded data takes over the path the old data occupied.
        - name: move old postgres database out of the way
          shell:
            cmd: "sudo -u postgres mv /var/lib/postgresql/patroni/pvc /var/lib/postgresql/patroni/pvc.old"
            chdir: "/var/lib/postgresql"

        - name: move new postgres database into place
          shell:
            cmd: "sudo -u postgres mv /var/lib/postgresql/{{ new_postgres_version }}/pvc /var/lib/postgresql/patroni/pvc"
            chdir: "/var/lib/postgresql"

        - name: ensure recovery.conf is absent
          file:
            dest: /var/lib/postgresql/patroni/pvc/recovery.conf
            state: absent

        - name: delete patroni cluster znodes
          shell: "/usr/share/zookeeper/bin/zkCli.sh -server {{ ansible_hostname }}:2181 deleteall /patroni/pvc"

        - name: start patroni service on leader
          service:
            name: patroni
            state: started
            masked: false
      run_once: true
      delegate_to: "{{ patroni_leader }}"

    # The follower data directories are deleted before Patroni is started
    # on them again.
    - name: remove old data directory on patroni followers
      file:
        dest: /var/lib/postgresql/patroni/pvc
        state: absent
      run_once: true
      delegate_to: "{{ item }}"
      loop: "{{ patroni_followers }}"

    - name: start patroni service on followers
      service:
        name: patroni
        state: started
        masked: false
      run_once: true
      delegate_to: "{{ item }}"
      loop: "{{ patroni_followers }}"

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    # run_once plus a delegated loop restarts the nodes one after another,
    # with a 30 second sleep at the end of each.
    - name: restart PVC daemons on all nodes
      shell: "systemctl restart pvchealthd && systemctl restart pvcworkerd && systemctl restart pvcnoded && sleep 30"
      run_once: true
      delegate_to: "{{ item }}"
      loop: "{{ ansible_play_hosts }}"

    - name: set first node as primary coordinator
      command: "pvc node primary --wait {{ ansible_play_hosts[0].split('.')[0] }}"
      run_once: true
      delegate_to: "{{ ansible_play_hosts[0] }}"

    - name: unset PVC maintenance mode
      command: pvc cluster maintenance off
|