Skip to content

Commit ff86b85

Browse files
committed
Add the steps to reboot the computes after update.
This sequence implements reboot of the compute nodes after the update. By default it's not run and `cifmw_update_reboot_test` must be set to true to activate it. We have one instance created. If the hypervisor being rebooted has the instance, that instance will be live-migrated to another hypervisor before the reboot and migrated back to that original hypervisor after the reboot. Some basic sanity checks are performed after the reboot and before the migration back to ensure that the necessary services are up and running. During the reboot we start two scripts. One monitors and logs the reboot of the hypervisors. The other logs where the instance is currently running. The log files can be found in `~/ci-framework-data/tests/update/` in `monitor_servers.log` and `monitor_vm_placement.log` respectively. A note about node evacuation. We are still using node evacuation from the nova cli. This command has not been ported to the openstack cli. There's a discussion about it [on launchpad](https://bugs.launchpad.net/python-openstackclient/+bug/2055552). Also, we do the evacuation only if there is more than one hypervisor available. When only one compute is available we stop the instance and, after the reboot, we just restart it. The official documentation mentions only the live-migration path, but as we also use the live-migration in the test sequence that part is covered. We still expect customers to use the nova cli as it's way more user friendly and is still currently working. Closes: https://issues.redhat.com/browse/OSPRH-8937
1 parent 53adb63 commit ff86b85

File tree

10 files changed

+429
-2
lines changed

10 files changed

+429
-2
lines changed

roles/update/README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,7 @@ Role to run update
1616
* `cifmw_update_ctl_plane_max_fail`: (Integer) For continuous control plane testing, maximum number of failures allowed. Default to 3.
1717
* `cifmw_update_ctl_plane_max_tries`: (Integer) For continuous control plane testing, number of retries allowed to stop and destroy the last vm created. Each retry is 5 seconds apart. Default to 84, so 7 minutes.
1818
* `cifmw_update_openstackclient_pod_timeout`: (Integer) Maximum number of seconds to wait for the openstackclient Pod to be available during control plane testing, as it is being restarted during update. Default to `10` seconds.
19-
19+
* `cifmw_update_reboot_test`: (Boolean) Activate the reboot test after update. Default to `false`.
20+
* `cifmw_update_ansible_ssh_private_key_file`: (String) Define the path to the private key file used for the compute nodes.
21+
* `cifmw_update_wait_retries_reboot`: (Integer) Number of retries to wait for a compute node reboot. One retry is done every five seconds. Default to 60, so five minutes.
2022
## Examples

roles/update/defaults/main.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,15 @@ cifmw_update_timestamper_cmd: >-
3737
cifmw_update_ping_start_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_start_ping.sh"
cifmw_update_ping_stop_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_stop_ping.sh"

# Operation in the openstack namespace
cifmw_update_openstack_cmd: >-
  oc rsh -n {{ cifmw_update_namespace }} openstackclient openstack

## User facing
# Activate the compute reboot test after update. Off by default.
cifmw_update_reboot_test: false
# Path to the private key used to reach the compute nodes over SSH.
# NOTE: no surrounding double quotes inside the folded scalar — quotes in a
# block scalar are literal characters and would end up in the path value.
cifmw_update_ansible_ssh_private_key_file: >-
  {{ ansible_ssh_private_key_file | default(ansible_user_dir ~ '/.ssh/id_cifw') }}
# Number of retries (one every 5s) while waiting for a compute node reboot:
# 60 retries -> five minutes.
cifmw_update_wait_retries_reboot: 60

cifmw_update_ping_test: false
cifmw_update_create_volume: false

roles/update/molecule/default/prepare.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@
2828
- role: ci_setup
2929
- role: install_yamls
3030
tasks:
31-
- name: Set custom cifmw PATH reusable fact
31+
- name: Set custom some reusable facts
3232
ansible.builtin.set_fact:
3333
cifmw_path: "{{ ansible_user_dir }}/.crc/bin:{{ ansible_user_dir }}/.crc/bin/oc:{{ ansible_user_dir }}/bin:{{ ansible_env.PATH }}"
34+
cifmw_update_reboot_test: false
3435
cacheable: true

roles/update/tasks/main.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,8 @@
7474
- not cifmw_update_run_dryrun | bool
7575
ansible.builtin.command: |
7676
{{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh
77+
78+
- name: Reboot the compute nodes
79+
ansible.builtin.include_tasks: reboot_computes.yml
80+
when:
81+
- cifmw_update_reboot_test | bool
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
---
# Reboot every compute node after the update. Two background scripts record
# the reboot of the hypervisors and the placement of the test instance; their
# logs end up under the cifmw_update_artifacts_basedir directory.
- name: Register storage backend type
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  # `set -o pipefail` is a bashism: force bash as the executable.
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} volume service list -f json |
    jq -r -c '.[] | select(.Binary | contains("cinder-volume")) | .Host'
  args:
    executable: /bin/bash
  register: storage_backend
  changed_when: false

- name: Get the list of OpenStack hypervisors
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: |
    {{ cifmw_update_openstack_cmd }} hypervisor list -f json
  register: hypervisor_list
  changed_when: false

- name: Parse the hypervisor list to extract hostnames
  ansible.builtin.set_fact:
    hypervisor_hostnames: "{{ hypervisor_list.stdout | from_json | map(attribute='Hypervisor Hostname') | list }}"

- name: Create a reboot monitor servers script
  ansible.builtin.template:
    src: "monitor_servers.sh.j2"
    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_servers.sh"
    mode: "0775"

# The script keeps running in the background; we record its PID to stop it
# at the end of the reboot sequence.
- name: Start the monitor servers script
  ansible.builtin.shell: |
    nohup {{ cifmw_update_artifacts_basedir }}/monitor_servers.sh &> /dev/null &
    echo $!
  args:
    executable: /bin/bash
  register: monitor_servers_job
  changed_when: true

- name: Create a monitor placement monitor script
  ansible.builtin.template:
    src: "monitor_vm_placement.sh.j2"
    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh"
    mode: "0775"

- name: Start the monitor placement script
  ansible.builtin.shell: |
    nohup {{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh &> /dev/null &
    echo $!
  args:
    executable: /bin/bash
  register: monitor_placement_job
  changed_when: true

- name: Iterate over each hypervisor for the reboot sequence
  ansible.builtin.include_tasks: reboot_hypervisor.yml
  loop: "{{ hypervisor_hostnames }}"
  loop_control:
    loop_var: hypervisor

# rc 1 is tolerated: we can still have a race between `kill -0` and `kill`,
# even if unlikely.
- name: Stop the monitor servers script if running
  ansible.builtin.shell: |
    if kill -0 {{ monitor_servers_job.stdout }} &>/dev/null; then
        kill {{ monitor_servers_job.stdout }}
    fi
  args:
    executable: /bin/bash
  register: kill_result
  changed_when: true
  failed_when: kill_result.rc not in [0, 1]

- name: Stop the monitor placement script if running
  ansible.builtin.shell: |
    if kill -0 {{ monitor_placement_job.stdout }} &>/dev/null; then
        kill {{ monitor_placement_job.stdout }}
    fi
  args:
    executable: /bin/bash
  register: kill_result
  changed_when: true
  failed_when: kill_result.rc not in [0, 1]
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
---
# Reboot a single hypervisor (loop_var: hypervisor) and restore its
# instances afterwards. With more than one hypervisor available, the active
# instances are live-evacuated before the reboot and live-migrated back
# after it; with a single hypervisor the instances are simply restarted
# post-reboot.
- name: Extract short hostname from FQDN
  ansible.builtin.set_fact:
    cifmw_update_hypervisor_short_name: "{{ hypervisor.split('.')[0] }}"

- name: Display current stage
  ansible.builtin.debug:
    msg: "Rebooting {{ cifmw_update_hypervisor_short_name }}"

- name: Define command for nova interaction
  ansible.builtin.set_fact:
    cifmw_update_bash_cmd: >-
      oc rsh -n {{ cifmw_update_namespace }} openstackclient bash -c

- name: Check active VMs on hypervisor
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
    | jq -r -c '.[] | select(.Status | contains("ACTIVE") or contains("PAUSED")) | .ID'
  args:
    executable: /bin/bash
  register: active_vms
  changed_when: false

# nova host-evacuate-live has no openstack CLI equivalent yet, see
# https://bugs.launchpad.net/python-openstackclient/+bug/2055552
# --block-migrate is only needed when the storage is not shared (no ceph).
- name: Evacuate VMs if they are running
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    {{ cifmw_update_bash_cmd }} ". cloudrc &&
    nova host-evacuate-live
    {% if 'ceph' not in storage_backend.stdout %}
    --block-migrate
    {% endif %}
    {{ hypervisor }}"
  when:
    - active_vms.stdout != ''
    - hypervisor_hostnames | length > 1
  changed_when: true

- name: Wait for compute node to get quiesced
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
    | jq -r -c '[.[] | select(.Status |
    contains("ACTIVE") or contains("PAUSED") or contains("MIGRATING"))]
    | length'
  args:
    executable: /bin/bash
  register: compute_node_instances
  # Compare the full count: a substring test like .find("0") would also
  # match "10", "20", … and succeed too early.
  until: compute_node_instances.stdout | trim == "0"
  retries: 30
  delay: 5
  changed_when: false
  when:
    - active_vms.stdout != ''
    - hypervisor_hostnames | length > 1

- name: Reboot the hypervisors using CR
  ansible.builtin.include_tasks: reboot_hypervisor_using_cr.yml

- name: Perform sanity checks post-reboot
  ansible.builtin.include_tasks: reboot_hypervisor_sanity_checks.yml
  vars:
    current_hypervisor: "{{ hypervisor }}"

- name: Display current stage
  ansible.builtin.debug:
    msg: "Migrate back {{ item }} to {{ cifmw_update_hypervisor_short_name }}."
  with_items: "{{ active_vms.stdout_lines }}"

# After the migration request we print the instance's current host; the
# `until` loop retries until the instance reports the original hypervisor.
- name: Migrate back VMs post-reboot
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_bash_cmd }} ". cloudrc &&
    nova live-migration
    {% if 'ceph' not in storage_backend.stdout %}
    --block-migrate
    {% endif %}
    {{ item }} {{ hypervisor }}";
    {{ cifmw_update_openstack_cmd }} server show {{ item }} -f json |
    jq -r -c '. | .["OS-EXT-SRV-ATTR:host"]'
  args:
    executable: /bin/bash
  register: instance_migration_result
  until: instance_migration_result.stdout.find(hypervisor) > -1
  retries: 30
  delay: 5
  changed_when: true
  with_items: "{{ active_vms.stdout_lines }}"
  when:
    - active_vms.stdout != ''
    - hypervisor_hostnames | length > 1

- name: Start VMs post-reboot when only one hypervisor is present
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} server start {{ item }};
    sleep 5;
    {{ cifmw_update_openstack_cmd }} server show {{ item }} -f json
    | jq -r .status
  args:
    executable: /bin/bash
  register: instance_start_result
  until: instance_start_result.stdout == "ACTIVE"
  retries: 30
  delay: 1  # We already wait 5s in the shell.
  changed_when: true
  with_items: "{{ active_vms.stdout_lines }}"
  when:
    - active_vms.stdout != ''
    - hypervisor_hostnames | length == 1
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
---
# Post-reboot sanity checks: wait until the compute-related services on
# current_hypervisor report healthy again before moving on.
- name: Display current stage
  ansible.builtin.debug:
    msg: |
      Testing the status of the services for {{ current_hypervisor }} after reboot.

- name: Verify nova-compute services
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} compute service list
    --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("nova-compute")) | .State'
  args:
    executable: /bin/bash
  register: nova_compute_status
  until: nova_compute_status.stdout == 'up'
  retries: 30
  delay: 5
  changed_when: false

- name: Verify ovn-controller services
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} network agent list
    --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("ovn-controller")) | .Alive'
  args:
    executable: /bin/bash
  register: ovn_controller_status
  until: ovn_controller_status.stdout == 'true'
  retries: 30
  delay: 5
  changed_when: false

- name: Verify networking-ovn-metadata-agent
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_openstack_cmd }} network agent list
    --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("neutron-ovn-metadata-agent")) | .Alive'
  args:
    executable: /bin/bash
  register: networking_ovn_metadata_status
  until: networking_ovn_metadata_status.stdout == 'true'
  retries: 30
  delay: 5
  changed_when: false
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
---
# Trigger a reboot of the current hypervisor through an
# OpenStackDataPlaneDeployment CR running the reboot-os service, limited to
# that node, then wait for the deployment to complete.
- name: Fetch NodeSets for the Reboot OpenStackDataPlaneDeployment
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.shell: >-
    set -o pipefail;
    oc -n {{ cifmw_update_namespace }}
    get openstackdataplanenodeset -o name
    | awk -F'/' '{print $2}'
  args:
    executable: /bin/bash
  register: cifmw_update_node_sets
  changed_when: false

- name: Construct the Reboot CR name
  ansible.builtin.set_fact:
    # Full timestamp including minutes (%M): a format skipping the minutes
    # would let two reboots of the same node in the same hour collide.
    cifmw_reboot_dep_name: >-
      {{
        'reboot-' ~ cifmw_update_hypervisor_short_name ~ '-' ~
        lookup('pipe', 'date +%Y%m%d%H%M%S')
      }}

- name: Create the OpenStackDataPlaneDeployment CR used for reboot
  ansible.builtin.copy:
    dest: "{{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml"
    content: "{{ _content | to_nice_yaml }}"
    mode: "0644"
  vars:
    _content:
      apiVersion: dataplane.openstack.org/v1beta1
      kind: OpenStackDataPlaneDeployment
      metadata:
        name: "{{ cifmw_reboot_dep_name }}"
        namespace: "{{ cifmw_update_namespace }}"
      spec:
        # All NodeSets are listed, but ansibleLimit restricts the run to the
        # hypervisor being rebooted.
        nodeSets: "{{ cifmw_update_node_sets.stdout
          | split('\n')
          | map('trim')
          | reject('equalto', '')
          | list
          }}"
        servicesOverride:
          - reboot-os
        ansibleExtraVars:
          edpm_reboot_strategy: force
        ansibleLimit: "{{ cifmw_update_hypervisor_short_name }}"

- name: Create the OpenStackDataPlaneDeployment CR to trigger a reboot
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.command: >-
    oc -n {{ cifmw_update_namespace }}
    create -f {{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml
  changed_when: true

- name: Wait for the reboot to finish
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path | default(ansible_env.PATH) }}"
  ansible.builtin.command: >-
    oc -n {{ cifmw_update_namespace }}
    wait --for=condition=Ready
    openstackdataplanedeployment/{{ cifmw_reboot_dep_name }}
    --timeout={{ (cifmw_update_wait_retries_reboot | int * 5) }}s
  changed_when: false

0 commit comments

Comments
 (0)