Skip to content
Draft

DNM Dev #3982

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/dictionary/en-custom.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Marjanovic
Nemanja
NICs
NodeHealthCheck
PV
PyYAML
RHCOS
SNO
Expand Down Expand Up @@ -554,6 +555,7 @@ rnkhwaejdughvnuzsdz
ro
rolename
rootdevicehints
rootfs
rpms
rpmss
rsa
Expand Down Expand Up @@ -637,6 +639,7 @@ undercloud
unicast
unittest
unmanaged
unallocated
uoyt
uri
usermod
Expand Down
27 changes: 24 additions & 3 deletions hooks/playbooks/ceph.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,10 @@
gather_facts: false
become: true
tasks:
# The play sets gather_facts: false, so the network facts needed for the
# IP-to-host mapping are collected explicitly, limited to the network subset.
- name: Gather network facts for IP-to-host mapping
ansible.builtin.setup:
gather_subset:
- network
# jq is normally installed by cifmw_block_device role, but when cifmw_ceph_spec_data_devices
# is defined (indicating block devices are already present), the block device creation play
# is skipped. Install jq explicitly here to ensure it's available for ceph operations.
Expand Down Expand Up @@ -159,6 +163,9 @@
hostvars[_target]['ansible_ssh_private_key_file'] |
default(lookup('env', 'ANSIBLE_SSH_PRIVATE_KEY'))
}}
cifmw_block_device_thin_pool: "{{ cifmw_ceph_thin_pool | default('') }}"
cifmw_block_device_thin_lv_size: "{{ cifmw_ceph_thin_lv_size | default('50G') }}"
cifmw_block_device_thin_lv_name: "ceph_osd_{{ i }}"
cifmw_block_device_image_file: /var/lib/ceph-osd-{{ i }}.img
cifmw_block_device_loop: /dev/loop{{ i + 3 }}
cifmw_block_lv_name: ceph_lv{{ i }}
Expand All @@ -170,6 +177,17 @@
loop_var: i
loop: "{{ range(0, cifmw_num_osds_perhost|int) }}"

# Render a "data_devices: paths:" snippet with one list entry per item of
# cifmw_block_device_paths and store it as cifmw_ceph_spec_data_devices.
# delegate_to + delegate_facts attach the fact to localhost instead of the
# host the task runs for.
# NOTE(review): cifmw_block_device_paths is presumably produced by the
# cifmw_block_device role included in the loop above — confirm it is always
# defined (and non-empty) when this task runs.
- name: Build data_devices for ceph spec from cifmw_block_device outputs
ansible.builtin.set_fact:
cifmw_ceph_spec_data_devices: |
data_devices:
paths:
{% for p in cifmw_block_device_paths %}
- {{ p }}
{% endfor %}
delegate_to: localhost
delegate_facts: true

- name: Build Ceph spec and conf from gathered IPs of the target inventory group
tags: spec
hosts: localhost
Expand All @@ -186,9 +204,12 @@
when:
- not cifmw_ceph_ipv6 | default(false)
ansible.builtin.set_fact:
ssh_network_range: 192.168.122.0/24
# storage_network_range: 172.18.0.0/24
storage_mgmt_network_range: 172.20.0.0/24
ssh_network_range: >-
{{ cifmw_ceph_ssh_network_range | default('192.168.122.0/24') }}
storage_network_range: >-
{{ cifmw_ceph_storage_network_range | default('172.18.0.0/24') }}
storage_mgmt_network_range: >-
{{ cifmw_ceph_storage_mgmt_network_range | default('172.20.0.0/24') }}
all_addresses: ansible_all_ipv4_addresses
ms_bind_ipv4: true
ms_bind_ipv6: false
Expand Down
92 changes: 92 additions & 0 deletions hooks/playbooks/fix_swift_endpoint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
---
# Workaround: fix Swift (RGW) endpoint IP and port in Keystone.
#
# The cifmw_cephadm configure_object.yml registers the endpoint using
# cifmw_cephadm_rgw_vip:8080, which assumes the ceph ingress service
# (haproxy+keepalived) is deployed. On single-host HCI (no ingress),
# the VIP is never bound and port 8080 never listens — RGW is directly
# reachable on the host's storage IP at port 8082.
#
# This hook discovers the actual RGW address and port from the running
# ceph cluster and updates the Keystone endpoints to match.
#
# Inputs (all optional except the kubeconfig):
#   cifmw_ceph_target          inventory group holding the ceph hosts
#                              (default: computes); the first host is used.
#   cifmw_cephadm_ns           namespace of the openstackclient pod
#                              (default: openstack).
#   cifmw_openshift_kubeconfig kubeconfig used for the oc calls.
#
# FIXME(ci-framework): The proper fix belongs in
# cifmw_cephadm/tasks/configure_object.yml — it should detect whether
# ingress is deployed and choose VIP:8080 vs host_ip:8082 accordingly.
- name: Fix Swift endpoint to match actual RGW address
  hosts: "{{ groups[cifmw_ceph_target | default('computes')] | first }}"
  gather_facts: false
  vars:
    _target_group: "{{ cifmw_ceph_target | default('computes') }}"
    _target: "{{ groups[_target_group] | default([]) | first }}"
    ansible_ssh_private_key_file: >-
      {{
        hostvars[_target]['ansible_ssh_private_key_file'] |
        default(lookup('env', 'ANSIBLE_SSH_PRIVATE_KEY'))
      }}
  tasks:
    # The two ceph queries are read-only. They run through the command
    # module (no shell needed: there is no pipe or redirection), so stderr
    # is kept in the registered result for troubleshooting while stdout
    # still carries only the command output, and they never report "changed".
    - name: Get RGW daemon endpoint from ceph
      become: true
      ansible.builtin.command:
        cmd: cephadm shell -- ceph orch ps --daemon-type rgw --format json
      register: _rgw_ps
      changed_when: false

    - name: Get ingress service status
      become: true
      ansible.builtin.command:
        cmd: cephadm shell -- ceph orch ls --service-type ingress --format json
      register: _ingress_ls
      changed_when: false

    - name: Set RGW endpoint facts
      vars:
        # Only parse stdout when it looks like a JSON list; anything else is
        # treated as "nothing deployed".
        # NOTE(review): some ceph releases are believed to print a plain
        # "No services reported" / "No daemons reported" message instead of
        # "[]" even with --format json — confirm against the deployed version.
        _rgw_daemons: >-
          {{ (_rgw_ps.stdout | from_json)
             if (_rgw_ps.stdout | trim).startswith('[') else [] }}
        _ingress_services: >-
          {{ (_ingress_ls.stdout | from_json)
             if (_ingress_ls.stdout | trim).startswith('[') else [] }}
        _has_ingress: >-
          {{ _ingress_services | length > 0 and
             (_ingress_services | first).status.running | default(0) | int > 0 }}
      block:
        - name: Determine endpoint from ingress VIP
          when: _has_ingress | bool
          ansible.builtin.set_fact:
            _rgw_port: "{{ (_ingress_services | first).spec.frontend_port | default(8080) }}"
            # The VIP is reported with a prefix length; keep the address only.
            _rgw_ip: >-
              {{ (_ingress_services | first).status.virtual_ip |
                 regex_replace('/.*$', '') }}

        # Without ingress the endpoint must come from an RGW daemon; fail
        # with a readable message rather than a template error on `first`.
        - name: Ensure an RGW daemon is reported when ingress is absent
          when: not (_has_ingress | bool)
          ansible.builtin.assert:
            that:
              - _rgw_daemons | length > 0
              - (_rgw_daemons | first).ports | default([]) | length > 0
            fail_msg: >-
              No running ingress service and no RGW daemon with a listening
              port was reported by 'ceph orch ps'; cannot determine the
              Swift endpoint.

        - name: Determine endpoint from RGW daemon
          when: not (_has_ingress | bool)
          ansible.builtin.set_fact:
            _rgw_port: "{{ (_rgw_daemons | first).ports | first }}"
            _rgw_ip: "{{ (_rgw_daemons | first).ip | default(ansible_host) }}"

    - name: Update Swift endpoints in Keystone
      environment:
        KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
      delegate_to: localhost
      block:
        - name: Get current Swift endpoints
          ansible.builtin.command:
            cmd: >-
              oc -n {{ cifmw_cephadm_ns | default('openstack') }}
              exec -t openstackclient --
              openstack endpoint list --service object-store -f json
          register: _swift_eps
          changed_when: false

        - name: Update each Swift endpoint URL
          vars:
            _eps: "{{ _swift_eps.stdout | from_json }}"
            # An IPv6 literal has to be bracketed inside a URL authority.
            _rgw_host: >-
              {{ '[' ~ _rgw_ip ~ ']' if ':' in (_rgw_ip | string) else _rgw_ip }}
            _url_prefix: "http://{{ _rgw_host }}:{{ _rgw_port }}"
          # argv passes the URL (which contains "%(tenant_id)s") as a single
          # argument without relying on shell quoting.
          ansible.builtin.command:
            argv:
              - oc
              - "-n"
              - "{{ cifmw_cephadm_ns | default('openstack') }}"
              - exec
              - "-t"
              - openstackclient
              - "--"
              - openstack
              - endpoint
              - set
              - "--url"
              - "{{ _url_prefix }}/swift/v1/AUTH_%(tenant_id)s"
              - "{{ item.ID }}"
          loop: "{{ _eps }}"
          loop_control:
            label: "{{ item.Interface }}"
          # Skip endpoints that already point at the discovered address.
          when: _url_prefix not in (item.URL | default(''))
          changed_when: true
44 changes: 43 additions & 1 deletion roles/bm_sno/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ provision IP via `/etc/hosts` entries managed by the role.
| `cifmw_bm_agent_core_password` | str | — | Set a `core` user password post-install via MachineConfig |
| `cifmw_bm_agent_live_debug` | bool | `false` | Patch the agent ISO with password, autologin, and systemd debug shell on `tty6` for discovery-phase console access (requires `cifmw_bm_agent_core_password`) |
| `cifmw_bm_agent_disabled_ifaces` | list | `[]` | Extra NIC names to disable IPv4/IPv6 on during agent-based install. Prevents overlapping-subnet validation failures when multiple NICs share a native VLAN (e.g. `[eno2]`). The interfaces stay link-up but get no IP address; post-install NNCP configures them. |
| `cifmw_bm_agent_lvms_partition` | dict | `{}` | When set, creates an Ignition partition at install time to cap CoreOS rootfs growth and leave unallocated space for the LVMS StorageClass. Keys: `device` (required, e.g. `/dev/nvme0n1`), `rootfs_mib` (default `150000`), `size_mib` (default `0` = rest of disk), `label` (default `lvmstorage`). See [LVMS partition](#lvms-partition). |

## Secrets management

Expand Down Expand Up @@ -119,7 +120,8 @@ If the binary already exists in the working directory it is reused.
2. Ensure `GenericUsbBoot` is enabled in BIOS (auto-enable with power cycle if allowed)
3. Power off the host
4. Generate SSH keys, template `install-config.yaml` and `agent-config.yaml`
5. Acquire `openshift-install` binary (see above) and run `openshift-install agent create image` to build the agent ISO
5. Optionally generate an LVMS partition MachineConfig into `openshift/` manifests
6. Acquire `openshift-install` binary (see above) and run `openshift-install agent create image` to build the agent ISO
7. Optionally patch the ISO for discovery-phase console access
8. Serve the ISO via a root podman httpd container (rootless podman cannot use privileged ports)
9. Eject any existing VirtualMedia, then insert the agent ISO
Expand Down Expand Up @@ -267,6 +269,46 @@ cifmw_bm_agent_core_password: changeme
cifmw_bm_agent_live_debug: true
```

## LVMS partition

By default CoreOS expands its rootfs partition to fill the entire disk
at first boot. To reserve space for the LVMS (Logical Volume Manager
Storage) StorageClass, set `cifmw_bm_agent_lvms_partition` with at
least the `device` key. The role injects a MachineConfig manifest into
the agent ISO that creates a labeled partition via Ignition — before
`growfs` runs — so CoreOS rootfs stops at `rootfs_mib` and the
remainder is available for LVMS.

```yaml
cifmw_bm_agent_lvms_partition:
device: /dev/disk/by-path/pci-0000:65:00.0-scsi-0:3:111:0
rootfs_mib: 150000 # ~150 GB for CoreOS (minimum 25000)
size_mib: 0 # 0 = rest of disk
label: lvmstorage # partition label
```

After OCP is installed, create an `LVMCluster` CR that targets the
partition by label:

```yaml
apiVersion: lvm.topolvm.io/v1alpha1
kind: LVMCluster
metadata:
name: lvmcluster
namespace: openshift-storage
spec:
storage:
deviceClasses:
- name: lvmstorage
deviceSelector:
paths:
- /dev/disk/by-partlabel/lvmstorage
thinPoolConfig:
name: thin-pool
overprovisionRatio: 10
sizePercent: 90
```

## References

* [ci-framework reproducer documentation](https://ci-framework.readthedocs.io/en/latest/roles/reproducer.html)
Expand Down
11 changes: 11 additions & 0 deletions roles/bm_sno/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,14 @@ cifmw_bm_agent_live_debug: false
cifmw_bm_agent_vmedia_uefi_path: ""
cifmw_bm_agent_enable_usb_boot: true
cifmw_bm_agent_disabled_ifaces: []

# LVMS partition: restrict CoreOS rootfs growth and leave
# unallocated space for the LVMS StorageClass.
# Set cifmw_bm_agent_lvms_partition to enable.
# Example:
# cifmw_bm_agent_lvms_partition:
# device: /dev/disk/by-path/pci-0000:65:00.0-scsi-0:3:111:0
# rootfs_mib: 150000 # rootfs capped at ~150 GB
# size_mib: 0 # 0 = rest of disk
# label: lvmstorage
cifmw_bm_agent_lvms_partition: {}
28 changes: 28 additions & 0 deletions roles/bm_sno/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,34 @@
regexp: '^pullSecret:'
line: "pullSecret: '<REDACTED>'"

# Only runs when cifmw_bm_agent_lvms_partition is non-empty: validates the
# settings, then templates a MachineConfig into <work_dir>/openshift/.
- name: Create LVMS partition MachineConfig manifest
  when: cifmw_bm_agent_lvms_partition | default({}) | length > 0
  block:
    # rootfs_mib and size_mib fall back to the same defaults the template
    # uses (150000 and 0), so only explicitly bad values are rejected.
    - name: Validate LVMS partition settings
      ansible.builtin.assert:
        that:
          - cifmw_bm_agent_lvms_partition.device is defined
          - cifmw_bm_agent_lvms_partition.device | length > 0
          - (cifmw_bm_agent_lvms_partition.rootfs_mib | default(150000) | int) >= 25000
          - (cifmw_bm_agent_lvms_partition.size_mib | default(0) | int) >= 0
        fail_msg: >-
          cifmw_bm_agent_lvms_partition needs a non-empty 'device'
          (e.g. /dev/nvme0n1, /dev/disk/by-path/...), a 'rootfs_mib' of at
          least 25000 MiB and a non-negative 'size_mib' (0 = rest of disk).

    - name: Create openshift manifests directory
      ansible.builtin.file:
        path: "{{ _work_dir }}/openshift"
        state: directory
        mode: "0755"

    - name: Template LVMS partition MachineConfig
      vars:
        # Short alias read by the template.
        _lvms: "{{ cifmw_bm_agent_lvms_partition }}"
      ansible.builtin.template:
        src: lvms_partition_machineconfig.yaml.j2
        dest: "{{ _work_dir }}/openshift/98-lvms-partition.yaml"
        mode: "0644"

- name: Ensure nmstatectl is available for agent-config networkConfig validation
become: true
ansible.builtin.package:
Expand Down
18 changes: 18 additions & 0 deletions roles/bm_sno/templates/lvms_partition_machineconfig.yaml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
---
# MachineConfig (role: master) carrying an Ignition storage section that
# declares one labelled partition on _lvms.device.
#   label    <- _lvms.label      (default 'lvmstorage')
#   startMiB <- _lvms.rootfs_mib (default 150000), i.e. where the new
#               partition begins
#   sizeMiB  <- _lvms.size_mib   (default 0)
# Both numeric values are cast to int before rendering.
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
labels:
machineconfiguration.openshift.io/role: master
name: 98-lvms-partition
spec:
config:
ignition:
version: 3.4.0
storage:
disks:
- device: "{{ _lvms.device }}"
partitions:
- label: "{{ _lvms.label | default('lvmstorage') }}"
startMiB: {{ _lvms.rootfs_mib | default(150000) | int }}
sizeMiB: {{ _lvms.size_mib | default(0) | int }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
# source: nova05epsilon/edpm-nodeset-values-post-ceph/values.yaml.j2
# Auto-populates ceph_conf from files written by ceph.yml hook.
# The ceph.yml post_stage_run hook (via cifmw_ceph_client role) writes
# Ceph config files to cifmw_ceph_client_fetch_dir (default /tmp/).
# This template reads those files and provides them as base64-encoded
# values under data.ceph_conf (DCN convention).
#
# The encoded values are double-quoted so they are always rendered as YAML
# strings, even if a file is empty (an unquoted empty value would be null).
{% set _fetch_dir = cifmw_ceph_client_fetch_dir | default('/tmp') %}
{% set _cluster = cifmw_ceph_client_cluster | default('ceph') %}
{% set _conf_file = (_fetch_dir, _cluster ~ '.conf') | path_join %}
{% set _keyring_file = (_fetch_dir, _cluster ~ '.client.openstack.keyring') | path_join %}
data:
  ceph_conf:
    {{ _cluster }}.client.openstack.keyring: "{{ lookup('file', _keyring_file, rstrip=False) | b64encode }}"
    {{ _cluster }}.conf: "{{ lookup('file', _conf_file, rstrip=False) | b64encode }}"
18 changes: 18 additions & 0 deletions roles/ci_local_storage/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@ If apply, please explain the privilege escalation done in this role.
* `cifmw_cls_create_ee_storage`: (Bool) Param to create ee_storage. Defaults to `false`.
* `cifmw_cls_namespace`: (String) The namespace where OCP resources will be installed. Defaults to `openstack`.
* `cifmw_cls_action`: (String) Action to perform, can be `create` or `clean`. Defaults to `create`.
* `cifmw_cls_oc_debug_fallback`: (Bool) Use `oc debug node/` to create PV directories on k8s nodes that are not reachable via SSH from the Ansible inventory. When enabled, the role computes which k8s nodes have no matching SSH-reachable inventory host and falls back to `oc debug` for those nodes. Applies to both create and cleanup. Defaults to `false`. Use it with an SNO BM setup.
* `cifmw_cls_storage_manifest`: (Dict) The storage manifest resource to be used to initiate storage class.

## Examples

### Standard (CRC / VM-based)
```YAML
- hosts: localhost
vars:
Expand All @@ -32,3 +35,18 @@ If apply, please explain the privilege escalation done in this role.
- ansible.builtin.include_role:
name: ci_local_storage
```

### Baremetal SNO
On bare-metal Single Node OpenShift the k8s node is typically not present
in the Ansible inventory for SSH access. Enable `cifmw_cls_oc_debug_fallback`
so the role uses `oc debug node/` to manage PV directories instead:
```YAML
- hosts: localhost
vars:
cifmw_openshift_kubeconfig: "{{ ansible_user_dir }}/.kube/kubeconfig"
cifmw_cls_pv_count: 20
cifmw_cls_oc_debug_fallback: true
tasks:
- ansible.builtin.include_role:
name: ci_local_storage
```
1 change: 1 addition & 0 deletions roles/ci_local_storage/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ cifmw_cls_storage_provisioner: cifmw
cifmw_cls_create_ee_storage: false
cifmw_cls_namespace: openstack
cifmw_cls_action: create
cifmw_cls_oc_debug_fallback: false

cifmw_cls_storage_manifest:
kind: StorageClass
Expand Down
Loading
Loading