From 97e0b8c54d9a46b4c2a459b914a8d62fb0a653de Mon Sep 17 00:00:00 2001 From: Bohdan Dobrelia Date: Mon, 20 Apr 2026 12:54:09 +0200 Subject: [PATCH 1/6] [ci_local_storage] Fall back to oc debug for PV dir creation When no Ansible inventory host matches a k8s node hostname (e.g. bare metal SNO where the node is not SSH-accessible), the role silently skips directory creation while still creating PVs that reference non-existent paths. Add an oc debug fallback that creates directories on each node via a debug pod. Generated-by: claude-4.6-opus-high Signed-off-by: Bohdan Dobrelia --- docs/dictionary/en-custom.txt | 1 + roles/ci_local_storage/README.md | 18 ++++++ roles/ci_local_storage/defaults/main.yml | 1 + .../molecule/default/converge.yml | 62 +++++++++++++++++++ roles/ci_local_storage/tasks/cleanup.yml | 7 +++ roles/ci_local_storage/tasks/main.yml | 5 ++ .../tasks/uncovered_node_dirs.yml | 43 +++++++++++++ 7 files changed, 137 insertions(+) create mode 100644 roles/ci_local_storage/tasks/uncovered_node_dirs.yml diff --git a/docs/dictionary/en-custom.txt b/docs/dictionary/en-custom.txt index 67aae5cf2..3990a5c96 100644 --- a/docs/dictionary/en-custom.txt +++ b/docs/dictionary/en-custom.txt @@ -21,6 +21,7 @@ Marjanovic Nemanja NICs NodeHealthCheck +PV PyYAML RHCOS SNO diff --git a/roles/ci_local_storage/README.md b/roles/ci_local_storage/README.md index 39be46e36..f8ca27be4 100644 --- a/roles/ci_local_storage/README.md +++ b/roles/ci_local_storage/README.md @@ -17,9 +17,12 @@ If apply, please explain the privilege escalation done in this role. * `cifmw_cls_create_ee_storage`: (Bool) Param to create ee_storage. Defaults to `false`. * `cifmw_cls_namespace`: (String) The namespace where OCP resources will be installed. Defaults to `openstack`. * `cifmw_cls_action`: (String) Action to perform, can be `create` or `clean`. Defaults to `create`. 
+* `cifmw_cls_oc_debug_fallback`: (Bool) Use `oc debug node/` to create PV directories on k8s nodes that are not reachable via SSH from the Ansible inventory. When enabled, the role computes which k8s nodes have no matching SSH-reachable inventory host and falls back to `oc debug` for those nodes. Applies to both create and cleanup. Defaults to `false`. Use it with an SNO BM setup. * `cifmw_cls_storage_manifest`: (Dict) The storage manifest resource to be used to initiate storage class. ## Examples + +### Standard (CRC / VM-based) ```YAML - hosts: localhost vars: @@ -32,3 +35,18 @@ If apply, please explain the privilege escalation done in this role. - ansible.builtin.include_role: name: ci_local_storage ``` + +### Baremetal SNO +On bare-metal Single Node OpenShift the k8s node is typically not present +in the Ansible inventory for SSH access. Enable `cifmw_cls_oc_debug_fallback` +so the role uses `oc debug node/` to manage PV directories instead: +```YAML + - hosts: localhost + vars: + cifmw_openshift_kubeconfig: "{{ ansible_user_dir }}/.kube/kubeconfig" + cifmw_cls_pv_count: 20 + cifmw_cls_oc_debug_fallback: true + tasks: + - ansible.builtin.include_role: + name: ci_local_storage +``` diff --git a/roles/ci_local_storage/defaults/main.yml b/roles/ci_local_storage/defaults/main.yml index c10e036e3..854c15e8c 100644 --- a/roles/ci_local_storage/defaults/main.yml +++ b/roles/ci_local_storage/defaults/main.yml @@ -28,6 +28,7 @@ cifmw_cls_storage_provisioner: cifmw cifmw_cls_create_ee_storage: false cifmw_cls_namespace: openstack cifmw_cls_action: create +cifmw_cls_oc_debug_fallback: false cifmw_cls_storage_manifest: kind: StorageClass diff --git a/roles/ci_local_storage/molecule/default/converge.yml b/roles/ci_local_storage/molecule/default/converge.yml index 74526bc50..eea430d0a 100644 --- a/roles/ci_local_storage/molecule/default/converge.yml +++ b/roles/ci_local_storage/molecule/default/converge.yml @@ -137,3 +137,65 @@ }} ansible.builtin.assert: that: 
"cifmw_cls_namespace not in ns_names" + + - name: Test oc-debug fallback path (uncovered_node_dirs.yml) + vars: + cifmw_cls_pv_count: 3 + cifmw_cls_local_storage_name: /mnt/openstack-fallback + cifmw_cls_oc_debug_fallback: true + block: + - name: Get k8s node names for fallback test + kubernetes.core.k8s_info: + kubeconfig: "{{ cifmw_openshift_kubeconfig }}" + kind: Node + register: _fb_k8s_nodes + + - name: Simulate no SSH-reachable hosts matching k8s nodes + ansible.builtin.set_fact: + cifmw_ci_local_storage_k8s_hostnames: + - "{{ _fb_k8s_nodes.resources[0].metadata.name }}" + _hostnames: + results: [] + + - name: Run uncovered_node_dirs.yml (create) + vars: + cifmw_cls_action: create + ansible.builtin.include_tasks: + file: "{{ playbook_dir }}/../../tasks/uncovered_node_dirs.yml" + + - name: Assert uncovered nodes were identified + ansible.builtin.assert: + that: + - _cls_uncovered_nodes | length == 1 + + - name: Verify directories created on node + delegate_to: crc + become: true + register: _fb_check + ansible.builtin.stat: + path: "/mnt/openstack-fallback/pv{{ '%02d' | format(item | int) }}" + loop: "{{ range(1, 4) }}" + + - name: Assert all fallback directories exist + ansible.builtin.assert: + that: item.stat.exists + loop: "{{ _fb_check.results }}" + loop_control: + label: "{{ item.invocation.module_args.path }}" + + - name: Run uncovered_node_dirs.yml (cleanup) + vars: + cifmw_cls_action: clean + ansible.builtin.include_tasks: + file: "{{ playbook_dir }}/../../tasks/uncovered_node_dirs.yml" + + - name: Verify fallback directories removed + delegate_to: crc + become: true + register: _fb_removed + ansible.builtin.stat: + path: "/mnt/openstack-fallback" + + - name: Assert fallback directory tree is gone + ansible.builtin.assert: + that: not _fb_removed.stat.exists diff --git a/roles/ci_local_storage/tasks/cleanup.yml b/roles/ci_local_storage/tasks/cleanup.yml index 111470e98..f22b6db7c 100644 --- a/roles/ci_local_storage/tasks/cleanup.yml +++ 
b/roles/ci_local_storage/tasks/cleanup.yml @@ -55,6 +55,13 @@ loop_control: loop_var: host +- name: Delete PV directories on nodes unreachable via SSH + vars: + cifmw_cls_action: "clean" + when: + - cifmw_cls_oc_debug_fallback | bool + ansible.builtin.include_tasks: uncovered_node_dirs.yml + - name: Remove the cifmw_cls_namespace namespace kubernetes.core.k8s: state: absent diff --git a/roles/ci_local_storage/tasks/main.yml b/roles/ci_local_storage/tasks/main.yml index 169f58161..02fdd96da 100644 --- a/roles/ci_local_storage/tasks/main.yml +++ b/roles/ci_local_storage/tasks/main.yml @@ -52,6 +52,11 @@ loop_control: loop_var: host +- name: Manage PV directories on nodes unreachable via SSH + when: + - cifmw_cls_oc_debug_fallback | bool + ansible.builtin.include_tasks: uncovered_node_dirs.yml + - name: Generate pv related storage manifest file ansible.builtin.template: src: storage.yaml.j2 diff --git a/roles/ci_local_storage/tasks/uncovered_node_dirs.yml b/roles/ci_local_storage/tasks/uncovered_node_dirs.yml new file mode 100644 index 000000000..02d5e1c7a --- /dev/null +++ b/roles/ci_local_storage/tasks/uncovered_node_dirs.yml @@ -0,0 +1,43 @@ +--- +- name: Identify k8s nodes not reachable via SSH (SNO BM) + vars: + _ssh_covered: >- + {{ + _hostnames.results | + default([]) | + selectattr('stdout', 'defined') | + map(attribute='stdout') | + list + }} + ansible.builtin.set_fact: + _cls_uncovered_nodes: >- + {{ + cifmw_ci_local_storage_k8s_hostnames | + difference(_ssh_covered) + }} + +- name: Manage PV directories via oc debug for unreachable nodes + when: + - _cls_uncovered_nodes | length > 0 + vars: + _action_script: >- + {% if cifmw_cls_action == 'create' %} + for i in $(seq 1 {{ cifmw_cls_pv_count | int }}); + do d=$(printf '%02d' "$i"); + mkdir -p '{{ cifmw_cls_local_storage_name }}'/pv$d && + chmod 0775 '{{ cifmw_cls_local_storage_name }}'/pv$d; done + {% else %} + rm -rf '{{ cifmw_cls_local_storage_name }}' + {% endif %} + ansible.builtin.command: + cmd: >- + 
oc debug node/{{ node_name }} + --kubeconfig={{ cifmw_openshift_kubeconfig }} + -- chroot /host bash -c "{{ _action_script }}" + register: _cls_oc_debug + retries: 3 + delay: 10 + until: _cls_oc_debug.rc == 0 + loop: "{{ _cls_uncovered_nodes }}" + loop_control: + loop_var: node_name From a0449871475f9053e51f1054d1182f8a48e7bc78 Mon Sep 17 00:00:00 2001 From: Bohdan Dobrelia Date: Tue, 17 Mar 2026 13:32:28 +0100 Subject: [PATCH 2/6] [ci_gen_kustomize_values] nova05epsilon: ceph conf The ceph.yml post_stage_run hook (via cifmw_ceph_client role) writes Ceph config files to cifmw_ceph_client_fetch_dir (default /tmp/). This template reads those files and provides them as base64-encoded values under data.ceph_conf (DCN convention). Generated-by: claude-4.6-opus-high Signed-off-by: Bohdan Dobrelia --- .../edpm-nodeset-values-post-ceph/values.yaml.j2 | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 roles/ci_gen_kustomize_values/templates/nova05epsilon/edpm-nodeset-values-post-ceph/values.yaml.j2 diff --git a/roles/ci_gen_kustomize_values/templates/nova05epsilon/edpm-nodeset-values-post-ceph/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/nova05epsilon/edpm-nodeset-values-post-ceph/values.yaml.j2 new file mode 100644 index 000000000..002e4d8b9 --- /dev/null +++ b/roles/ci_gen_kustomize_values/templates/nova05epsilon/edpm-nodeset-values-post-ceph/values.yaml.j2 @@ -0,0 +1,15 @@ +--- +# source: nova05epsilon/edpm-nodeset-values-post-ceph/values.yaml.j2 +# Auto-populates ceph_conf from files written by ceph.yml hook. +# The ceph.yml post_stage_run hook (via cifmw_ceph_client role) writes +# Ceph config files to cifmw_ceph_client_fetch_dir (default /tmp/). +# This template reads those files and provides them as base64-encoded +# values under data.ceph_conf (DCN convention). 
+{% set _fetch_dir = cifmw_ceph_client_fetch_dir | default('/tmp') %} +{% set _cluster = cifmw_ceph_client_cluster | default('ceph') %} +{% set _conf_file = (_fetch_dir, _cluster ~ '.conf') | path_join %} +{% set _keyring_file = (_fetch_dir, _cluster ~ '.client.openstack.keyring') | path_join %} +data: + ceph_conf: + {{ _cluster }}.client.openstack.keyring: {{ lookup('file', _keyring_file, rstrip=False) | b64encode }} + {{ _cluster }}.conf: {{ lookup('file', _conf_file, rstrip=False) | b64encode }} From 49f537a71da1f19c450e10e77cbd3121d9f87cb6 Mon Sep 17 00:00:00 2001 From: Bohdan Dobrelia Date: Thu, 14 May 2026 10:17:37 +0200 Subject: [PATCH 3/6] [ceph] Allow overriding ssh and storage_mgmt To allow BM SNO with ceph using custom ceph CIDR values, make ssh_network_range and storage_mgmt_network_range overridable via cifmw_ceph_ssh_network_range and cifmw_ceph_storage_mgmt_network_range. Both are set in set_fact which clobbers extra vars, so we use the cifmw_ indirection with default() to preserve original defaults. NOTE: storage_network_range also needs this treatment. It used to be commented out in set_fact, and this change needs extra testing with Ceph ci jobs perhaps. Also gather network facts for IP-to-host mapping. Signed-off-by: Bohdan Dobrelia --- hooks/playbooks/ceph.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/hooks/playbooks/ceph.yml b/hooks/playbooks/ceph.yml index 13f5ce610..dd3aa4822 100644 --- a/hooks/playbooks/ceph.yml +++ b/hooks/playbooks/ceph.yml @@ -104,6 +104,10 @@ gather_facts: false become: true tasks: + - name: Gather network facts for IP-to-host mapping + ansible.builtin.setup: + gather_subset: + - network # jq is normally installed by cifmw_block_device role, but when cifmw_ceph_spec_data_devices # is defined (indicating block devices are already present), the block device creation play # is skipped. Install jq explicitly here to ensure it's available for ceph operations. 
@@ -186,9 +190,12 @@ when: - not cifmw_ceph_ipv6 | default(false) ansible.builtin.set_fact: - ssh_network_range: 192.168.122.0/24 - # storage_network_range: 172.18.0.0/24 - storage_mgmt_network_range: 172.20.0.0/24 + ssh_network_range: >- + {{ cifmw_ceph_ssh_network_range | default('192.168.122.0/24') }} + storage_network_range: >- + {{ cifmw_ceph_storage_network_range | default('172.18.0.0/24') }} + storage_mgmt_network_range: >- + {{ cifmw_ceph_storage_mgmt_network_range | default('172.20.0.0/24') }} all_addresses: ansible_all_ipv4_addresses ms_bind_ipv4: true ms_bind_ipv6: false From def3fbd6528d01428746a1ebd7a63fa15515036b Mon Sep 17 00:00:00 2001 From: Bohdan Dobrelia Date: Wed, 3 Jun 2026 17:42:24 +0200 Subject: [PATCH 4/6] [swift] add hook for Swift and Ceph RGW on SNO On SNO with a single EDPM compute (single-host CephHCI), the Ceph ingress service (haproxy/keepalived) is not deployed because the ceph_rgw.yml.j2 spec template only creates it for multi-host clusters. Add a hook to correct the Keystone Swift endpoint for this case. The proper fix belongs in cifmw_cephadm/tasks/configure_object.yml which should detect whether ingress is deployed and choose VIP:8080 vs host_ip:8082 accordingly. Signed-off-by: Bohdan Dobrelia --- hooks/playbooks/fix_swift_endpoint.yml | 92 ++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 hooks/playbooks/fix_swift_endpoint.yml diff --git a/hooks/playbooks/fix_swift_endpoint.yml b/hooks/playbooks/fix_swift_endpoint.yml new file mode 100644 index 000000000..381eb1492 --- /dev/null +++ b/hooks/playbooks/fix_swift_endpoint.yml @@ -0,0 +1,92 @@ +--- +# Workaround: fix Swift (RGW) endpoint IP and port in Keystone. +# +# The cifmw_cephadm configure_object.yml registers the endpoint using +# cifmw_cephadm_rgw_vip:8080, which assumes the ceph ingress service +# (haproxy+keepalived) is deployed. 
On single-host HCI (no ingress), +# the VIP is never bound and port 8080 never listens — RGW is directly +# reachable on the host's storage IP at port 8082. +# +# This hook discovers the actual RGW address and port from the running +# ceph cluster and updates the Keystone endpoints to match. +# +# FIXME(ci-framework): The proper fix belongs in +# cifmw_cephadm/tasks/configure_object.yml — it should detect whether +# ingress is deployed and choose VIP:8080 vs host_ip:8082 accordingly. +- name: Fix Swift endpoint to match actual RGW address + hosts: "{{ groups[cifmw_ceph_target | default('computes')] | first }}" + gather_facts: false + vars: + _target_group: "{{ cifmw_ceph_target | default('computes') }}" + _target: "{{ groups[_target_group] | default([]) | first }}" + ansible_ssh_private_key_file: >- + {{ + hostvars[_target]['ansible_ssh_private_key_file'] | + default(lookup('env', 'ANSIBLE_SSH_PRIVATE_KEY')) + }} + tasks: + - name: Get RGW daemon endpoint from ceph + become: true + ansible.builtin.shell: | + set -euo pipefail + cephadm shell -- ceph orch ps --daemon-type rgw --format json 2>/dev/null + register: _rgw_ps + + - name: Get ingress service status + become: true + ansible.builtin.shell: | + set -euo pipefail + cephadm shell -- ceph orch ls --service-type ingress --format json 2>/dev/null + register: _ingress_ls + + - name: Set RGW endpoint facts + vars: + _rgw_daemons: "{{ _rgw_ps.stdout | from_json }}" + _ingress_services: "{{ _ingress_ls.stdout | from_json }}" + _has_ingress: >- + {{ _ingress_services | length > 0 and + (_ingress_services | first).status.running | default(0) | int > 0 }} + block: + - name: Determine endpoint from ingress VIP + when: _has_ingress | bool + ansible.builtin.set_fact: + _rgw_port: "{{ (_ingress_services | first).spec.frontend_port | default(8080) }}" + _rgw_ip: >- + {{ (_ingress_services | first).status.virtual_ip | + regex_replace('/.*$', '') }} + + - name: Determine endpoint from RGW daemon + when: not (_has_ingress | 
bool) + ansible.builtin.set_fact: + _rgw_port: "{{ (_rgw_daemons | first).ports | first }}" + _rgw_ip: "{{ (_rgw_daemons | first).ip | default(ansible_host) }}" + + - name: Update Swift endpoints in Keystone + environment: + KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}" + delegate_to: localhost + block: + - name: Get current Swift endpoints + ansible.builtin.shell: | + set -euo pipefail + oc -n {{ cifmw_cephadm_ns | default('openstack') }} \ + exec -t openstackclient -- \ + openstack endpoint list --service object-store -f json + register: _swift_eps + + - name: Update each Swift endpoint URL + vars: + _eps: "{{ _swift_eps.stdout | from_json }}" + _url_prefix: "http://{{ _rgw_ip }}:{{ _rgw_port }}" + ansible.builtin.shell: | + set -euo pipefail + oc -n {{ cifmw_cephadm_ns | default('openstack') }} \ + exec -t openstackclient -- \ + openstack endpoint set \ + --url '{{ _url_prefix }}/swift/v1/AUTH_%(tenant_id)s' \ + {{ item.ID }} + loop: "{{ _eps }}" + loop_control: + label: "{{ item.Interface }}" + when: >- + _url_prefix not in (item.URL | default('')) From 7d3955d732a2b7a8bda60ee3ad44ba435b905f03 Mon Sep 17 00:00:00 2001 From: Bohdan Dobrelia Date: Wed, 27 May 2026 19:31:03 +0200 Subject: [PATCH 5/6] [multiple] Add thin-provisioned LV support for Ceph OSDs Modify cifmw_block_device role to support loop back and thin LVM devices. For ceph deploy hook, when cifmw_ceph_thin_pool is defined, create thin LVs from an existing VG thin pool instead of loop-backed files. This supports baremetal hosts where the disk is fully consumed by a thin pool with no raw devices available for cephadm. The new play runs before the existing loop device play and sets cifmw_ceph_spec_data_devices with explicit LV paths, which also causes the loop device play to be skipped. 
Signed-off-by: Bohdan Dobrelia --- hooks/playbooks/ceph.yml | 14 ++++ roles/cifmw_block_device/README.md | 72 ++++++++++++++++----- roles/cifmw_block_device/defaults/main.yml | 8 +++ roles/cifmw_block_device/tasks/cleanup.yml | 19 +++++- roles/cifmw_block_device/tasks/loop.yml | 74 ++++++++++++++++++++++ roles/cifmw_block_device/tasks/main.yml | 57 ++--------------- roles/cifmw_block_device/tasks/thin.yml | 42 ++++++++++++ 7 files changed, 218 insertions(+), 68 deletions(-) create mode 100644 roles/cifmw_block_device/tasks/loop.yml create mode 100644 roles/cifmw_block_device/tasks/thin.yml diff --git a/hooks/playbooks/ceph.yml b/hooks/playbooks/ceph.yml index dd3aa4822..b3113044c 100644 --- a/hooks/playbooks/ceph.yml +++ b/hooks/playbooks/ceph.yml @@ -163,6 +163,9 @@ hostvars[_target]['ansible_ssh_private_key_file'] | default(lookup('env', 'ANSIBLE_SSH_PRIVATE_KEY')) }} + cifmw_block_device_thin_pool: "{{ cifmw_ceph_thin_pool | default('') }}" + cifmw_block_device_thin_lv_size: "{{ cifmw_ceph_thin_lv_size | default('50G') }}" + cifmw_block_device_thin_lv_name: "ceph_osd_{{ i }}" cifmw_block_device_image_file: /var/lib/ceph-osd-{{ i }}.img cifmw_block_device_loop: /dev/loop{{ i + 3 }} cifmw_block_lv_name: ceph_lv{{ i }} @@ -174,6 +177,17 @@ loop_var: i loop: "{{ range(0, cifmw_num_osds_perhost|int) }}" + - name: Build data_devices for ceph spec from cifmw_block_device outputs + ansible.builtin.set_fact: + cifmw_ceph_spec_data_devices: | + data_devices: + paths: + {% for p in cifmw_block_device_paths %} + - {{ p }} + {% endfor %} + delegate_to: localhost + delegate_facts: true + - name: Build Ceph spec and conf from gathered IPs of the target inventory group tags: spec hosts: localhost diff --git a/roles/cifmw_block_device/README.md b/roles/cifmw_block_device/README.md index 54f7eaafe..80ca68908 100644 --- a/roles/cifmw_block_device/README.md +++ b/roles/cifmw_block_device/README.md @@ -1,25 +1,31 @@ # cifmw_block_device -Creates a virtual block device with logical 
volumes. Useful for -deploying Ceph on a virtual machine which does not have any -block devices except for root. Creates a systemd unit so the -virtual block device comes back online during reboot. +Creates block devices with logical volumes for Ceph OSD testing. +Supports two modes: -The target system must have 7 GB of available disk space at minimum. +- **Loop mode** (default): creates a loop-backed file with LVM on top + and a systemd unit to restore it across reboots. Useful for VMs + that have no spare block devices. +- **Thin-pool mode**: creates thin-provisioned LVs from an existing + VG thin pool. Useful for bare-metal hosts that already have a + thin pool with available space. -This role will recreate the block device on each run. Thus, if there -is data on the block device from the previous run it will delete it. -The assumption is that the block device exists for testing and that -rebuilding the environment quickly is more important preserving any -test data. +The mode is selected by `cifmw_block_device_thin_pool`: when non-empty +the role uses thin-pool mode, otherwise loop mode. ## Privilege escalation -Requires root on the remote system to create loop back device and -systemd unit. +Requires root on the remote system to create devices and LVM objects. ## Parameters +### Common + +* `cifmw_block_device_thin_pool`: VG/pool path for thin-pool mode, + e.g. `vg/lv_thinpool`. When empty (default), loop mode is used. + +### Loop mode + * `cifmw_block_device_image_file`: Name of the `dd'd` image file (default `/var/lib/ceph-osd.img`) * `cifmw_block_device_size`: Size of the virtual block device (default @@ -34,8 +40,25 @@ systemd unit. 
restores the device on system startup (default `/etc/systemd/system/ceph-osd-losetup.service`) +### Thin-pool mode + +* `cifmw_block_device_thin_lv_size`: Size of each thin LV (default + `50G`) +* `cifmw_block_device_thin_lv_name`: Name of the thin LV to create + (default `ceph_osd`) + +## Output + +Both modes append to the `cifmw_block_device_paths` list fact. +Each role invocation adds one entry, so when called in a loop +the list accumulates all created device paths (e.g. +`["/dev/ceph_vg0/ceph_lv0", "/dev/ceph_vg1/ceph_lv1"]` or +`["/dev/vg/ceph_osd_0", "/dev/vg/ceph_osd_1"]`). + ## Examples +### Loop mode (default) + The following will create a 7 GB block device on the target system using the defaults above. ``` @@ -60,11 +83,30 @@ data_devices: paths: - /dev/ceph_vg/ceph_lv_data ``` -The following will stop and disable the systemd unit file which starts -the virtual block device, remove the logical volume, volume group, and -physical volume, and delete the loopback device and its backing file. + +### Thin-pool mode + +```yaml +- include_role: + name: cifmw_block_device + vars: + cifmw_block_device_thin_pool: "vg/lv_thinpool" + cifmw_block_device_thin_lv_size: "50G" + cifmw_block_device_thin_lv_name: "ceph_osd_0" +``` +Ceph spec entry: +```yaml +data_devices: + paths: + - /dev/vg/ceph_osd_0 ``` + +### Cleanup + +```yaml - import_role: name: cifmw_block_device tasks_from: cleanup ``` +For thin-pool mode, pass `cifmw_block_device_thin_pool` and +`cifmw_block_device_thin_lv_name` so the correct cleanup path runs. diff --git a/roles/cifmw_block_device/defaults/main.yml b/roles/cifmw_block_device/defaults/main.yml index aa2eb9764..d3bca17a2 100644 --- a/roles/cifmw_block_device/defaults/main.yml +++ b/roles/cifmw_block_device/defaults/main.yml @@ -18,9 +18,17 @@ # All variables intended for modification should be placed in this file. 
# All variables within this role should have a prefix of "cifmw_block_device" +# --- Loop-device mode (default) --- cifmw_block_device_image_file: /var/lib/ceph-osd.img cifmw_block_device_size: 7G cifmw_block_device_loop: /dev/loop3 cifmw_block_vg_name: ceph_vg cifmw_block_lv_name: ceph_lv_data cifmw_block_systemd_unit_file: /etc/systemd/system/ceph-osd-losetup.service + +# --- Thin-pool mode --- +# When non-empty, create thin-provisioned LVs from this existing thin pool +# instead of loop-backed files. Value is "VG/pool", e.g. "vg/lv_thinpool". +cifmw_block_device_thin_pool: "" +cifmw_block_device_thin_lv_size: "50G" +cifmw_block_device_thin_lv_name: "ceph_osd" diff --git a/roles/cifmw_block_device/tasks/cleanup.yml b/roles/cifmw_block_device/tasks/cleanup.yml index 721a74577..43ef1b34a 100644 --- a/roles/cifmw_block_device/tasks/cleanup.yml +++ b/roles/cifmw_block_device/tasks/cleanup.yml @@ -14,7 +14,22 @@ # License for the specific language governing permissions and limitations # under the License. 
-- name: Ensure ceph-osd-losetup is not running and disabled +- name: Clean up thin-provisioned LV + when: cifmw_block_device_thin_pool | length > 0 + become: true + ansible.builtin.command: + cmd: >- + lvremove --force + /dev/{{ cifmw_block_device_thin_pool.split('/')[0] }}/{{ cifmw_block_device_thin_lv_name }} + register: _thin_cleanup + failed_when: + - _thin_cleanup.rc != 0 + - "'not found' not in (_thin_cleanup.stderr | default(''))" + +- name: Clean up loop-backed block device + when: cifmw_block_device_thin_pool | default('') | length == 0 + block: + - name: Ensure ceph-osd-losetup is not running and disabled tags: systemd become: true ansible.builtin.systemd: @@ -37,7 +52,7 @@ pvremove --force {{ cifmw_block_device_loop }} lvs -- name: Use losetup and rm to cremove the loop device and backing image file +- name: Use losetup and rm to remove the loop device and backing image file become: true ansible.builtin.shell: cmd: |- diff --git a/roles/cifmw_block_device/tasks/loop.yml b/roles/cifmw_block_device/tasks/loop.yml new file mode 100644 index 000000000..bc54afac5 --- /dev/null +++ b/roles/cifmw_block_device/tasks/loop.yml @@ -0,0 +1,74 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +- name: Install packages + become: true + ansible.builtin.dnf: + name: + - util-linux + - lvm2 + - jq + - podman + state: present + +- name: Stat loop device see if it already exists + ansible.builtin.stat: + path: "{{ cifmw_block_device_loop }}" + register: cifmw_block_device_loop_res + +- name: Log to syslog if loop device exists + community.general.syslogger: + msg: "Warning {{ cifmw_block_device_loop }} already exists" + when: cifmw_block_device_loop_res.stat.exists + +- name: Use dd and losetup to create the loop device + become: true + ansible.builtin.shell: + cmd: |- + dd if=/dev/zero of={{ cifmw_block_device_image_file }} bs=1 count=0 seek={{ cifmw_block_device_size }} + losetup {{ cifmw_block_device_loop }} {{ cifmw_block_device_image_file }} + lsblk + +- name: Use {pv,vg,lv}create to create logical volume on loop device + become: true + ansible.builtin.shell: + cmd: |- + pvcreate {{ cifmw_block_device_loop }} + vgcreate {{ cifmw_block_vg_name }} {{ cifmw_block_device_loop }} + lvcreate -n {{ cifmw_block_lv_name }} -l +100%FREE {{ cifmw_block_vg_name }} + lvs + +- name: Create a systemd service that restores the device on startup + become: true + ansible.builtin.template: + src: templates/ceph-osd-losetup.service.j2 + dest: "{{ cifmw_block_systemd_unit_file }}" + mode: "0644" + force: true + +- name: Ensure ceph-osd-losetup is running and enabled + become: true + tags: systemd + ansible.builtin.systemd: + state: started + enabled: true + name: "{{ cifmw_block_systemd_unit_file | regex_replace('/etc/systemd/system/', '') }}" + +- name: Collect created device path + ansible.builtin.set_fact: + cifmw_block_device_paths: >- + {{ (cifmw_block_device_paths | default([])) + + ['/dev/' ~ cifmw_block_vg_name ~ '/' ~ cifmw_block_lv_name] }} diff --git a/roles/cifmw_block_device/tasks/main.yml b/roles/cifmw_block_device/tasks/main.yml index 9d2e4ff97..9706a0c7b 100644 --- a/roles/cifmw_block_device/tasks/main.yml +++ b/roles/cifmw_block_device/tasks/main.yml @@ 
-14,55 +14,10 @@ # License for the specific language governing permissions and limitations # under the License. -- name: Install packages - become: true - ansible.builtin.dnf: - name: - - util-linux - - lvm2 - - jq - - podman - state: present +- name: Create block device using thin-pool LV + when: cifmw_block_device_thin_pool | length > 0 + ansible.builtin.include_tasks: thin.yml -- name: Stat loop device see if it already exists - ansible.builtin.stat: - path: "{{ cifmw_block_device_loop }}" - register: cifmw_block_device_loop_res - -- name: Log to syslog if loop device exists - community.general.syslogger: - msg: "Warning {{ cifmw_block_device_loop }} already exists" - when: cifmw_block_device_loop_res.stat.exists - -- name: Use dd and losetup to create the loop device - become: true - ansible.builtin.shell: - cmd: |- - dd if=/dev/zero of={{ cifmw_block_device_image_file }} bs=1 count=0 seek={{ cifmw_block_device_size }} - losetup {{ cifmw_block_device_loop }} {{ cifmw_block_device_image_file }} - lsblk - -- name: Use {pv,vg,lv}create to create logical volume on loop device - become: true - ansible.builtin.shell: - cmd: |- - pvcreate {{ cifmw_block_device_loop }} - vgcreate {{ cifmw_block_vg_name }} {{ cifmw_block_device_loop }} - lvcreate -n {{ cifmw_block_lv_name }} -l +100%FREE {{ cifmw_block_vg_name }} - lvs - -- name: Create a systemd service that restores the device on startup - become: true - ansible.builtin.template: - src: templates/ceph-osd-losetup.service.j2 - dest: "{{ cifmw_block_systemd_unit_file }}" - mode: "0644" - force: true - -- name: Ensure ceph-osd-losetup is running and enabled - become: true - tags: systemd - ansible.builtin.systemd: - state: started - enabled: true - name: "{{ cifmw_block_systemd_unit_file | regex_replace('/etc/systemd/system/', '') }}" +- name: Create block device using loop-backed file + when: cifmw_block_device_thin_pool | length == 0 + ansible.builtin.include_tasks: loop.yml diff --git 
a/roles/cifmw_block_device/tasks/thin.yml b/roles/cifmw_block_device/tasks/thin.yml new file mode 100644 index 000000000..16304fc40 --- /dev/null +++ b/roles/cifmw_block_device/tasks/thin.yml @@ -0,0 +1,42 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +- name: Install lvm2 for thin-pool operations + become: true + ansible.builtin.dnf: + name: + - lvm2 + - jq + state: present + +- name: Create thin LV from existing thin pool + become: true + ansible.builtin.command: + cmd: >- + lvcreate -V {{ cifmw_block_device_thin_lv_size }} + --thin -n {{ cifmw_block_device_thin_lv_name }} + {{ cifmw_block_device_thin_pool }} + creates: "/dev/{{ cifmw_block_device_thin_pool.split('/')[0] }}/{{ cifmw_block_device_thin_lv_name }}" + register: _thin_lvcreate_result + failed_when: + - _thin_lvcreate_result.rc != 0 + - "'already exists' not in (_thin_lvcreate_result.stderr | default(''))" + +- name: Collect created device path + ansible.builtin.set_fact: + cifmw_block_device_paths: >- + {{ (cifmw_block_device_paths | default([])) + + ['/dev/' ~ cifmw_block_device_thin_pool.split('/')[0] ~ '/' ~ cifmw_block_device_thin_lv_name] }} From 7a5127329710e3c735d14d300d937619dabafc0b Mon Sep 17 00:00:00 2001 From: Bohdan Dobrelia Date: Mon, 25 May 2026 18:08:59 +0200 Subject: [PATCH 6/6] [bm_sno] Allow SNO deploy for LVMS (leave free space) Modify bm_sno role to let it provision thin volumes by LVM Storage operator for a proper bare 
metal performance in BM SNO CI jobs. Signed-off-by: Bohdan Dobrelia --- docs/dictionary/en-custom.txt | 2 + roles/bm_sno/README.md | 44 ++++++++++++++++++- roles/bm_sno/defaults/main.yml | 11 +++++ roles/bm_sno/tasks/main.yml | 28 ++++++++++++ .../lvms_partition_machineconfig.yaml.j2 | 18 ++++++++ 5 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 roles/bm_sno/templates/lvms_partition_machineconfig.yaml.j2 diff --git a/docs/dictionary/en-custom.txt b/docs/dictionary/en-custom.txt index 3990a5c96..a65c8d23a 100644 --- a/docs/dictionary/en-custom.txt +++ b/docs/dictionary/en-custom.txt @@ -555,6 +555,7 @@ rnkhwaejdughvnuzsdz ro rolename rootdevicehints +rootfs rpms rpmss rsa @@ -638,6 +639,7 @@ undercloud unicast unittest unmanaged +unallocated uoyt uri usermod diff --git a/roles/bm_sno/README.md b/roles/bm_sno/README.md index 259c7214f..42b394a8f 100644 --- a/roles/bm_sno/README.md +++ b/roles/bm_sno/README.md @@ -62,6 +62,7 @@ provision IP via `/etc/hosts` entries managed by the role. | `cifmw_bm_agent_core_password` | str | — | Set a `core` user password post-install via MachineConfig | | `cifmw_bm_agent_live_debug` | bool | `false` | Patch the agent ISO with password, autologin, and systemd debug shell on `tty6` for discovery-phase console access (requires `cifmw_bm_agent_core_password`) | | `cifmw_bm_agent_disabled_ifaces` | list | `[]` | Extra NIC names to disable IPv4/IPv6 on during agent-based install. Prevents overlapping-subnet validation failures when multiple NICs share a native VLAN (e.g. `[eno2]`). The interfaces stay link-up but get no IP address; post-install NNCP configures them. | +| `cifmw_bm_agent_lvms_partition` | dict | `{}` | When set, creates an Ignition partition at install time to cap CoreOS rootfs growth and leave unallocated space for the LVMS StorageClass. Keys: `device` (required, e.g. `/dev/nvme0n1`), `rootfs_mib` (default `150000`), `size_mib` (default `0` = rest of disk), `label` (default `lvmstorage`). 
See [LVMS partition](#lvms-partition). | ## Secrets management @@ -119,7 +120,8 @@ If the binary already exists in the working directory it is reused. 2. Ensure `GenericUsbBoot` is enabled in BIOS (auto-enable with power cycle if allowed) 3. Power off the host 4. Generate SSH keys, template `install-config.yaml` and `agent-config.yaml` -5. Acquire `openshift-install` binary (see above) and run `openshift-install agent create image` to build the agent ISO +5. Optionally generate an LVMS partition MachineConfig into `openshift/` manifests +6. Acquire `openshift-install` binary (see above) and run `openshift-install agent create image` to build the agent ISO 6. Optionally patch the ISO for discovery-phase console access 7. Serve the ISO via a root podman httpd container (rootless podman cannot use privileged ports) 8. Eject any existing VirtualMedia, then insert the agent ISO @@ -267,6 +269,46 @@ cifmw_bm_agent_core_password: changeme cifmw_bm_agent_live_debug: true ``` +## LVMS partition + +By default CoreOS expands its rootfs partition to fill the entire disk +at first boot. To reserve space for the LVMS (Logical Volume Manager +Storage) StorageClass, set `cifmw_bm_agent_lvms_partition` with at +least the `device` key. The role injects a MachineConfig manifest into +the agent ISO that creates a labeled partition via Ignition — before +`growfs` runs — so CoreOS rootfs stops at `rootfs_mib` and the +remainder is available for LVMS. 
+ +```yaml +cifmw_bm_agent_lvms_partition: + device: /dev/disk/by-path/pci-0000:65:00.0-scsi-0:3:111:0 + rootfs_mib: 150000 # ~150 GB for CoreOS (minimum 25000) + size_mib: 0 # 0 = rest of disk + label: lvmstorage # partition label +``` + +After OCP is installed, create an `LVMCluster` CR that targets the +partition by label: + +```yaml +apiVersion: lvm.topolvm.io/v1alpha1 +kind: LVMCluster +metadata: + name: lvmcluster + namespace: openshift-storage +spec: + storage: + deviceClasses: + - name: lvmstorage + deviceSelector: + paths: + - /dev/disk/by-partlabel/lvmstorage + thinPoolConfig: + name: thin-pool + overprovisionRatio: 10 + sizePercent: 90 +``` + ## References * [ci-framework reproducer documentation](https://ci-framework.readthedocs.io/en/latest/roles/reproducer.html) diff --git a/roles/bm_sno/defaults/main.yml b/roles/bm_sno/defaults/main.yml index 69d846b00..d0fd39cf4 100644 --- a/roles/bm_sno/defaults/main.yml +++ b/roles/bm_sno/defaults/main.yml @@ -7,3 +7,14 @@ cifmw_bm_agent_live_debug: false cifmw_bm_agent_vmedia_uefi_path: "" cifmw_bm_agent_enable_usb_boot: true cifmw_bm_agent_disabled_ifaces: [] + +# LVMS partition: restrict CoreOS rootfs growth and leave +# unallocated space for the LVMS StorageClass. +# Set cifmw_bm_agent_lvms_partition to enable. 
+# Example:
+#   cifmw_bm_agent_lvms_partition:
+#     device: /dev/disk/by-path/pci-0000:65:00.0-scsi-0:3:111:0
+#     rootfs_mib: 150000  # rootfs capped at ~150 GB
+#     size_mib: 0  # 0 = rest of disk
+#     label: lvmstorage
+cifmw_bm_agent_lvms_partition: {}
diff --git a/roles/bm_sno/tasks/main.yml b/roles/bm_sno/tasks/main.yml
index 02958f470..8c472eb92 100644
--- a/roles/bm_sno/tasks/main.yml
+++ b/roles/bm_sno/tasks/main.yml
@@ -219,6 +219,39 @@
     regexp: '^pullSecret:'
     line: "pullSecret: ''"
 
+# Optional: runs only when cifmw_bm_agent_lvms_partition is non-empty. Writes a
+# MachineConfig into the openshift/ manifests directory under _work_dir.
+- name: Create LVMS partition MachineConfig manifest
+  when: cifmw_bm_agent_lvms_partition | default({}) | length > 0
+  block:
+    - name: Validate LVMS partition device is set
+      ansible.builtin.assert:
+        that:
+          - cifmw_bm_agent_lvms_partition.device is defined
+          - cifmw_bm_agent_lvms_partition.device | length > 0
+          # rootfs_mib becomes the partition start offset (startMiB) in the template.
+          - (cifmw_bm_agent_lvms_partition.rootfs_mib | default(150000) | int) >= 25000
+        fail_msg: >-
+          cifmw_bm_agent_lvms_partition.device must be set
+          (e.g. /dev/nvme0n1, /dev/disk/by-path/...), and
+          cifmw_bm_agent_lvms_partition.rootfs_mib must be
+          at least 25000 (MiB).
+
+    # Target directory for the rendered manifest; created if missing.
+    - name: Create openshift manifests directory
+      ansible.builtin.file:
+        path: "{{ _work_dir }}/openshift"
+        state: directory
+        mode: "0755"
+
+    - name: Template LVMS partition MachineConfig
+      vars:
+        _lvms: "{{ cifmw_bm_agent_lvms_partition }}"
+      ansible.builtin.template:
+        src: lvms_partition_machineconfig.yaml.j2
+        dest: "{{ _work_dir }}/openshift/98-lvms-partition.yaml"
+        mode: "0644"
+
 - name: Ensure nmstatectl is available for agent-config networkConfig validation
   become: true
   ansible.builtin.package:
diff --git a/roles/bm_sno/templates/lvms_partition_machineconfig.yaml.j2 b/roles/bm_sno/templates/lvms_partition_machineconfig.yaml.j2
new file mode 100644
index 000000000..aab10e796
--- /dev/null
+++ b/roles/bm_sno/templates/lvms_partition_machineconfig.yaml.j2
@@ -0,0 +1,21 @@
+---
+# Rendered by the bm_sno role from cifmw_bm_agent_lvms_partition (_lvms).
+# startMiB is set to rootfs_mib, so the labeled partition begins at that
+# offset; sizeMiB 0 means "rest of disk". Defaults: 150000 / 0 / lvmstorage.
+apiVersion: machineconfiguration.openshift.io/v1
+kind: MachineConfig
+metadata:
+  labels:
+    machineconfiguration.openshift.io/role: master
+  name: 98-lvms-partition
+spec:
+  config:
+    ignition:
+      version: 3.4.0
+    storage:
+      disks:
+        - device: "{{ _lvms.device }}"
+          partitions:
+            - label: "{{ _lvms.label | default('lvmstorage') }}"
+              startMiB: {{ _lvms.rootfs_mib | default(150000) | int }}
+              sizeMiB: {{ _lvms.size_mib | default(0) | int }}