Commit 39a32d4
Add the steps to reboot the computes after update.
This sequence implements the reboot of the compute nodes after the update. We have one instance created. If the hypervisor being rebooted hosts that instance, the instance is live-migrated to another hypervisor before the reboot, then migrated back to its original hypervisor after the reboot. Some basic sanity checks are performed after the reboot, before the migration back, to ensure that the necessary services are up and running.

During the reboot we start two scripts: one monitors and logs the reboot of the hypervisors; the other logs where the instance is currently running. The log files can be found in `~/ci-framework-data/tests/update/`, in `monitor_servers.log` and `monitor_vm_placement.log` respectively.

A note about node evacuation: we still use node evacuation from the nova CLI, as this command has not been ported to the openstack CLI. There is a discussion about it [on launchpad](https://bugs.launchpad.net/python-openstackclient/+bug/2055552). The official documentation mentions only the live-migration path, but as we also use live migration in the test sequence, that part is covered. We still expect customers to use the nova CLI, as it is far more user friendly and currently still works.

Closes: https://issues.redhat.com/browse/OSPRH-8937
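For reference, the two CLI paths differ in scope: the nova CLI can drain a whole host in one call, while the openstack CLI migrates one server at a time. A sketch, with the host and server names as placeholders:

# Drain every instance off a compute node in one call (nova CLI only):
nova host-evacuate-live compute-0.example.com

# openstack CLI equivalent, one instance at a time:
openstack server migrate --live-migration my-instance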
1 parent bb6ae8e commit 39a32d4

File tree

10 files changed: +406 −2 lines changed

roles/update/README.md

Lines changed: 3 additions & 1 deletion
@@ -16,5 +16,7 @@ Role to run update
 * `cifmw_update_ctl_plane_max_fail`: (Integer) For continuous control plane testing, maximum number of failures allowed. Default to 3.
 * `cifmw_update_ctl_plane_max_tries`: (Integer) For continuous control plane testing, number of retries allowed to stop and destroy the last vm created. Each retry is 5 seconds apart. Default to 84, so 7 minutes.
 * `cifmw_update_openstackclient_pod_timeout`: (Integer) Maximum number of seconds to wait for the openstackclient Pod to be available during control plane testing, as it is being restarted during update. Default to `10` seconds.
-
+* `cifmw_update_reboot_test`: (Boolean) Activate the reboot test after update. Default to `True`.
+* `cifmw_update_ansible_ssh_private_key_file`: (String) Define the path to the private key file used for the compute nodes.
+* `cifmw_update_wait_retries_reboot`: (Integer) Number of retries to wait for a compute node reboot. One retry is done every five seconds. Default to 60, so five minutes.
 
 ## Examples
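As an illustration of the new knobs (not part of the diff; the playbook name is hypothetical, the variable names come from the README above):

# Disable the reboot test, or stretch the reboot wait to 10 minutes:
ansible-playbook update.yml \
  -e cifmw_update_reboot_test=false \
  -e cifmw_update_ansible_ssh_private_key_file=$HOME/.ssh/id_cifw \
  -e cifmw_update_wait_retries_reboot=120  # 120 retries x 5s = 10 minutes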

roles/update/defaults/main.yml

Lines changed: 8 additions & 0 deletions
@@ -37,7 +37,15 @@ cifmw_update_timestamper_cmd: >-
 cifmw_update_ping_start_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_start_ping.sh"
 cifmw_update_ping_stop_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_stop_ping.sh"
 
+# Operation in the openstack namespace
+cifmw_update_openstack_cmd: >-
+  oc rsh -n {{ cifmw_update_namespace }} openstackclient openstack
+
 ## User facing
+cifmw_update_reboot_test: true
+cifmw_update_ansible_ssh_private_key_file: >-
+  "{{ ansible_ssh_private_key_file | default(ansible_user_dir ~ '/.ssh/id_cifw') }}"
+cifmw_update_wait_retries_reboot: 60
 
 cifmw_update_ping_test: false
 cifmw_update_create_volume: false
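With these defaults, every OpenStack call in the role is wrapped in `oc rsh` so it runs inside the openstackclient pod. Assuming `cifmw_update_namespace` is `openstack`, a task line such as `{{ cifmw_update_openstack_cmd }} hypervisor list -f json` renders to:

# Rendered command (namespace assumed to be "openstack"):
oc rsh -n openstack openstackclient openstack hypervisor list -f json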

roles/update/molecule/default/prepare.yml

Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,8 @@
     - role: ci_setup
     - role: install_yamls
   tasks:
-    - name: Set custom cifmw PATH reusable fact
+    - name: Set some custom reusable facts
       ansible.builtin.set_fact:
         cifmw_path: "{{ ansible_user_dir }}/.crc/bin:{{ ansible_user_dir }}/.crc/bin/oc:{{ ansible_user_dir }}/bin:{{ ansible_env.PATH }}"
+        cifmw_update_reboot_test: false
         cacheable: true

roles/update/tasks/main.yml

Lines changed: 5 additions & 0 deletions
@@ -74,3 +74,8 @@
     - not cifmw_update_run_dryrun | bool
   ansible.builtin.command: |
     {{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh
+
+- name: Reboot the compute nodes
+  ansible.builtin.include_tasks: reboot_computes.yml
+  when:
+    - cifmw_update_reboot_test | bool
roles/update/tasks/reboot_computes.yml

Lines changed: 71 additions & 0 deletions (new file)

- name: Register storage backend type
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} volume service list -f json |
    jq -r -c '.[] | select(.Binary | contains("cinder-volume")) | .Host'
  register: storage_backend
  changed_when: false

- name: Get the list of OpenStack hypervisors
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: |
    {{ cifmw_update_openstack_cmd }} hypervisor list -f json
  register: hypervisor_list
  changed_when: false

- name: Parse the hypervisor list to extract hostnames
  ansible.builtin.set_fact:
    hypervisor_hostnames: "{{ hypervisor_list.stdout | from_json | map(attribute='Hypervisor Hostname') | list }}"

- name: Create a reboot monitor servers script
  ansible.builtin.template:
    src: "monitor_servers.sh.j2"
    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_servers.sh"
    mode: "0775"

- name: Start the monitor servers script
  ansible.builtin.shell: |
    nohup {{ cifmw_update_artifacts_basedir }}/monitor_servers.sh &> /dev/null &
    echo $!
  register: monitor_servers_job

- name: Create a VM placement monitor script
  ansible.builtin.template:
    src: "monitor_vm_placement.sh.j2"
    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh"
    mode: "0775"

- name: Start the monitor placement script
  ansible.builtin.shell: |
    nohup {{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh &> /dev/null &
    echo $!
  register: monitor_placement_job

- name: Iterate over each hypervisor for the reboot sequence
  ansible.builtin.include_tasks: reboot_hypervisor.yml
  loop: "{{ hypervisor_hostnames }}"
  loop_control:
    loop_var: hypervisor

- name: Stop the monitor servers script if running
  ansible.builtin.shell: |
    if kill -0 {{ monitor_servers_job.stdout }} &>/dev/null; then
      kill {{ monitor_servers_job.stdout }}
    fi
  register: kill_result
  failed_when: kill_result.rc not in [0, 1] # We can still have a race
                                            # between kill -0 and
                                            # kill, even if unlikely.

- name: Stop the monitor placement script if running
  ansible.builtin.shell: |
    if kill -0 {{ monitor_placement_job.stdout }} &>/dev/null; then
      kill {{ monitor_placement_job.stdout }}
    fi
  register: kill_result
  failed_when: kill_result.rc not in [0, 1]
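The monitor start/stop tasks above reduce to a common shell pattern: background the script, capture its PID, and later probe it with `kill -0` before killing it. A standalone sketch of the same lifecycle:

# Start the monitor detached and remember its PID (the task uses `echo $!`).
nohup ./monitor_servers.sh &> /dev/null &
pid=$!
# ... reboot sequence runs here ...
# Only send SIGTERM if the process is still alive; the script traps it to clean up.
if kill -0 "$pid" &> /dev/null; then
    kill "$pid"
fi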
roles/update/tasks/reboot_hypervisor.yml

Lines changed: 90 additions & 0 deletions (new file)

---
- name: Extract short hostname from FQDN
  ansible.builtin.set_fact:
    cifmw_update_hypervisor_short_name: "{{ hypervisor.split('.')[0] }}"

- name: Display current stage
  ansible.builtin.debug:
    msg: "Rebooting {{ cifmw_update_hypervisor_short_name }}"

- name: Define command for nova interaction
  ansible.builtin.set_fact:
    cifmw_update_bash_cmd: >-
      oc rsh -n {{ cifmw_update_namespace }} openstackclient bash -c

- name: Check active VMs on hypervisor
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
    | jq -r -c '.[] | select(.Status | contains("ACTIVE") or contains("PAUSED")) | .ID'
  register: active_vms
  changed_when: false

- name: Evacuate VMs if they are running
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    {{ cifmw_update_bash_cmd }} ". cloudrc &&
    nova host-evacuate-live
    {% if 'ceph' not in storage_backend.stdout %}
    --block-migrate
    {% endif %}
    {{ hypervisor }}"
  when: active_vms.stdout != ''
  changed_when: true

- name: Wait for compute node to get quiesced
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
    | jq -r -c '[.[] | select(.Status |
    contains("ACTIVE") or contains("PAUSED") or contains("MIGRATING"))]
    | length'
  register: compute_node_instances
  until: compute_node_instances.stdout.find("0") > -1
  retries: 30
  delay: 5
  when:
    - active_vms.stdout != ''

- name: Reboot the hypervisors using CR
  ansible.builtin.include_tasks: reboot_hypervisor_using_cr.yml

- name: Perform sanity checks post-reboot
  ansible.builtin.include_tasks: reboot_hypervisor_sanity_checks.yml
  vars:
    current_hypervisor: "{{ hypervisor }}"

- name: Display current stage
  ansible.builtin.debug:
    msg: "Migrate back {{ item }} to {{ cifmw_update_hypervisor_short_name }}."
  with_items: "{{ active_vms.stdout_lines }}"

- name: Migrate back VMs post-reboot
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_bash_cmd }} ". cloudrc &&
    nova live-migration
    {% if 'ceph' not in storage_backend.stdout %}
    --block-migrate
    {% endif %}
    {{ item }} {{ hypervisor }}";
    {{ cifmw_update_openstack_cmd }} server show {{ item }} -f json |
    jq -r -c '. | .["OS-EXT-SRV-ATTR:host"]'
  register: instance_migration_result
  until: instance_migration_result.stdout.find(hypervisor) > -1
  retries: 30
  delay: 5
  with_items: "{{ active_vms.stdout_lines }}"
  when:
    - active_vms.stdout != ''
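Run by hand from the openstackclient pod, the instance-selection filter used above looks like this (the hypervisor FQDN is a placeholder):

# IDs of ACTIVE or PAUSED instances on one hypervisor, as the tasks compute them:
openstack server list --all --host compute-0.example.com -f json \
  | jq -r -c '.[] | select(.Status | contains("ACTIVE") or contains("PAUSED")) | .ID'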
roles/update/tasks/reboot_hypervisor_sanity_checks.yml

Lines changed: 50 additions & 0 deletions (new file)

---
- name: Display current stage
  ansible.builtin.debug:
    msg: |
      Testing the status of the services for {{ current_hypervisor }} after reboot.

- name: Verify nova-compute services
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} compute service list
    --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("nova-compute")) | .State'
  register: nova_compute_status
  until: nova_compute_status.stdout == 'up'
  retries: 30
  delay: 5

- name: Verify ovn-controller services
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} network agent list
    --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("ovn-controller")) | .Alive'
  register: ovn_controller_status
  until: ovn_controller_status.stdout == 'true'
  retries: 30
  delay: 5

- name: Verify networking-ovn-metadata-agent
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} network agent list
    --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("neutron-ovn-metadata-agent")) | .Alive'
  register: networking_ovn_metadata_status
  until: networking_ovn_metadata_status.stdout == 'true'
  retries: 30
  delay: 5
roles/update/tasks/reboot_hypervisor_using_cr.yml

Lines changed: 62 additions & 0 deletions (new file)

---
- name: Fetch NodeSets for the Reboot OpenStackDataPlaneDeployment
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    oc -n {{ cifmw_update_namespace }}
    get openstackdataplanenodeset -o name
    | awk -F'/' '{print $2}'
  register: cifmw_update_node_sets
  changed_when: false

- name: Construct the Reboot CR name
  ansible.builtin.set_fact:
    cifmw_reboot_dep_name: >-
      {{
        'reboot-' ~ cifmw_update_hypervisor_short_name ~ '-' ~
        lookup('pipe', 'date +%Y%m%d%H%S')
      }}

- name: Create the OpenStackDataPlaneDeployment CR used for reboot
  ansible.builtin.copy:
    dest: "{{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml"
    content: "{{ _content | to_nice_yaml }}"
  vars:
    _content:
      apiVersion: dataplane.openstack.org/v1beta1
      kind: OpenStackDataPlaneDeployment
      metadata:
        name: "{{ cifmw_reboot_dep_name }}"
        namespace: "{{ cifmw_update_namespace }}"
      spec:
        nodeSets: "{{ cifmw_update_node_sets.stdout
          | split('\n')
          | map('trim')
          | reject('equalto', '')
          | list
          }}"
        servicesOverride:
          - reboot-os
        ansibleExtraVars:
          edpm_reboot_strategy: force
        ansibleLimit: "{{ cifmw_update_hypervisor_short_name }}"

- name: Create the OpenStackDataPlaneDeployment CR to trigger a reboot
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.command: >-
    oc -n {{ cifmw_update_namespace }}
    create -f {{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml

- name: Wait for the reboot to finish
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.command: >-
    oc -n {{ cifmw_update_namespace }}
    wait --for=condition=SetupComplete
    openstackdataplanedeployment/{{ cifmw_reboot_dep_name }}
    --timeout={{ (cifmw_update_wait_retries_reboot | int * 5) }}s
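Rendered with example values, the CR these tasks write and `oc create` would look roughly like this (name, namespace, and nodeset are illustrative, not taken from the diff):

apiVersion: dataplane.openstack.org/v1beta1
kind: OpenStackDataPlaneDeployment
metadata:
  name: reboot-compute-0-202406281205  # reboot-<short name>-<timestamp>
  namespace: openstack
spec:
  nodeSets:
    - openstack-edpm
  servicesOverride:
    - reboot-os
  ansibleExtraVars:
    edpm_reboot_strategy: force
  ansibleLimit: compute-0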
roles/update/templates/monitor_servers.sh.j2

Lines changed: 63 additions & 0 deletions (new file)

#!/bin/bash

set -e
set -o pipefail

servers=(
{% for server in hypervisor_hostnames %}
    {{ server.split('.')[0] }}
{% endfor %}
)

log_file="{{ cifmw_update_artifacts_basedir }}/monitor_servers.log"
pid_file="{{ cifmw_update_artifacts_basedir }}/monitor_servers.pid"

# Write the script's PID to the file
echo $$ > "$pid_file"

# Function to check server status via SSH
# TODO: ping always replies even if server is down, so using SSH instead.
check_servers() {
    for server in "${servers[@]}"; do
        if ssh -i {{ cifmw_update_ansible_ssh_private_key_file }} -o BatchMode=yes -o ConnectTimeout=5 "$server" "exit" &> /dev/null; then
            # Server is up
            if [ "${server_status[$server]}" == "down" ]; then
                echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is UP" | tee -a "$log_file"
                server_status[$server]="up"
            fi
        else
            # Server is down
            if [ "${server_status[$server]}" != "down" ]; then
                echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is DOWN" | tee -a "$log_file"
                server_status[$server]="down"
            fi
        fi
    done
}

# Function to handle script termination
cleanup() {
    TERMINATE=true
    echo "$(date '+%Y-%m-%d %H:%M:%S') - Termination signal received, waiting for check_servers to complete..." | tee -a "$log_file"
}

# Trap signals and call cleanup function
trap cleanup SIGINT SIGTERM

# Initialize server status array
declare -A server_status
for server in "${servers[@]}"; do
    server_status[$server]="unknown"
done

# Main loop to continuously check server status
while true; do
    check_servers
    # Handle signal
    if [ "$TERMINATE" = true ]; then
        echo "$(date '+%Y-%m-%d %H:%M:%S') - Script terminated" | tee -a "$log_file"
        rm -f "$pid_file"
        exit 0
    fi
    sleep 1
done
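Because the script records its own PID, it can also be exercised by hand; a usage sketch, run from the artifacts directory:

# Start it detached, watch the UP/DOWN transitions, then stop it via its PID file:
nohup ./monitor_servers.sh &> /dev/null &
tail -f monitor_servers.log
kill "$(cat monitor_servers.pid)"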
