Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions aci-preupgrade-validation-script.py
Original file line number Diff line number Diff line change
Expand Up @@ -6007,6 +6007,92 @@ def apic_vmm_inventory_sync_faults_check(**kwargs):
recommended_action=recommended_action,
doc_url=doc_url)

def _bootx_first_count(output):
    """Return the first all-digit line of `output` as an int, or 0 if there is none."""
    for line in output.strip().split('\n'):
        line = line.strip()
        if line.isdigit():
            return int(line)
    return 0


@check_wrapper(check_title='Bootx Service failure checks')
def bootx_service_failure_checks(fabric_nodes, cversion, username, password, **kwargs):
    """Flag APICs with 1000+ files in /firmware/tmp or "fatal" lines in the bootx logs.

    The check only runs when `cversion` is within 6.0(2h)-6.0(8h) or
    6.1(1f)-6.1(2g); any other version returns PASS with VER_NOT_AFFECTED.

    Args:
        fabric_nodes: list of `fabricNode` objects; only entries whose role is
            'controller' are inspected, once per unique address.
        cversion: current version object exposing `older_than()`/`newer_than()`.
        username: login name set on each `Connection`.
        password: password set on each `Connection`.
        **kwargs: unused; accepted so the check can be called with extra inputs.

    Returns:
        Result: MANUAL when `cversion` is missing, ERROR when no nodes or no
        controllers are available (or when a node could not be checked and
        nothing else failed), FAIL_UF when any node trips a threshold,
        otherwise PASS.
    """
    result = PASS
    headers = ["Node", "File Count", "Fatal Errors Found", "Status"]
    data = []
    recommended_action = 'Contact Cisco TAC to investigate all flagged high file and log counts'
    # Anchor must match the "Bootx Service failure checks" heading in validations.md.
    doc_url = 'https://datacenter.github.io/ACI-Pre-Upgrade-Validation-Script/validations/#bootx-service-failure-checks'

    if not cversion:
        return Result(result=MANUAL, msg="Current version not provided")

    if not fabric_nodes:
        return Result(result=ERROR, msg="Fabric node response empty. Is the cluster healthy?", doc_url=doc_url)

    affected = (
        (not cversion.older_than("6.0(2h)") and not cversion.newer_than("6.0(8h)")) or
        (not cversion.older_than("6.1(1f)") and not cversion.newer_than("6.1(2g)"))
    )
    if not affected:
        return Result(result=PASS, msg=VER_NOT_AFFECTED)

    # Filter for controller nodes only
    controller = [node for node in fabric_nodes if node['fabricNode']['attributes']['role'] == 'controller']
    if not controller:
        return Result(result=ERROR, msg="No controller nodes found. Is the cluster healthy?", doc_url=doc_url)

    checked_apics = set()
    has_error = False

    for apic in controller:
        attr = apic['fabricNode']['attributes']
        # The same address can appear more than once; log in to each APIC only once.
        if attr['address'] in checked_apics:
            continue
        checked_apics.add(attr['address'])
        node_id = attr['id']

        try:
            c = Connection(attr['address'])
            c.username = username
            c.password = password
            c.log = LOG_FILE
            c.connect()
        except Exception as e:
            data.append([node_id, '-', '-', 'ERROR: %s' % str(e)])
            has_error = True
            continue

        try:
            # Count files in /firmware/tmp; `|| echo 0` yields 0 when the directory is absent.
            c.cmd('[ -d /firmware/tmp ] && ls -1 /firmware/tmp 2>/dev/null | wc -l || echo 0')
            file_count = _bootx_first_count(c.output)

            # Count "fatal" matches (case-insensitive, recursive) in the bootx logs.
            c.cmd('[ -d /var/log/bootx/logs ] && grep -Ri "fatal" /var/log/bootx/logs/* 2>/dev/null | wc -l || echo 0')
            fatal_count = _bootx_first_count(c.output)
        except Exception as e:
            data.append([node_id, '-', '-', 'ERROR: %s' % str(e)])
            has_error = True
        else:
            # Each tripped threshold is reported as its own row for the node.
            if file_count >= 1000:
                data.append([node_id, str(file_count), "-", 'FAIL - High file count'])
                result = FAIL_UF

            if fatal_count > 0:
                data.append([node_id, "-", str(fatal_count), 'FAIL - Fatal errors found'])
                result = FAIL_UF
        finally:
            # Close on every path so a failed command does not leak the session.
            c.close()

    # A real failure outranks a node that merely could not be checked.
    if has_error and result == PASS:
        result = ERROR

    return Result(result=result, headers=headers, data=data, recommended_action=recommended_action, doc_url=doc_url)


# ---- Script Execution ----


Expand Down Expand Up @@ -6094,6 +6180,7 @@ class CheckManager:
post_upgrade_cb_check,
validate_32_64_bit_image_check,
fabric_link_redundancy_check,
bootx_service_failure_checks,

# Faults
apic_disk_space_faults_check,
Expand Down
18 changes: 18 additions & 0 deletions docs/docs/validations.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ Items | Defect | This Script
[Stale pconsRA Object][d26] | CSCwp22212 | :warning:{title="Deprecated"} | :no_entry_sign:
[ISIS DTEPs Byte Size][d27] | CSCwp15375 | :white_check_mark: | :no_entry_sign:
[Policydist configpushShardCont Crash][d28] | CSCwp95515 | :white_check_mark: |
[Bootx Service failure checks][d29] | CSCwn37676 | :white_check_mark: | :no_entry_sign:

[d1]: #ep-announce-compatibility
[d2]: #eventmgr-db-size-defect-susceptibility
Expand Down Expand Up @@ -220,6 +221,7 @@ Items | Defect | This Script
[d26]: #stale-pconsra-object
[d27]: #isis-dteps-byte-size
[d28]: #policydist-configpushshardcont-crash
[d29]: #bootx-service-failure-checks


## General Check Details
Expand Down Expand Up @@ -2614,6 +2616,21 @@ Due to [CSCwp95515][59], upgrading to an affected version while having any `conf
If any instances of `configpushShardCont` are flagged by this script, Cisco TAC must be contacted to identify and resolve the underlying issue before performing the upgrade.


### Bootx Service failure checks

Due to [CSCwn37676][62], when the APICs are running a release from 6.0(2h) through 6.0(8h) or from 6.1(1f) through 6.1(2g), upgrading to any target version with a high number of files in the `/firmware/tmp/` directory (1000 or more) or with fatal errors present in `/var/log/bootx/logs/` can cause the bootx service to fail, resulting in upgrade failures.

The script performs two validations on each APIC:

1. Checks if `/firmware/tmp/` directory contains 1000 or more files
2. Searches for "fatal" errors in `/var/log/bootx/logs/`

!!! warning
If this check fails, verify the bootx service status on the affected APIC(s) by running `systemctl status bootx`. If the service is not running, the APIC is already experiencing the issue, which must be resolved before proceeding with the upgrade.

!!! tip
Certain high-churn logging configurations have been found to cause excessive files in `/firmware/tmp/` while on non-fixed versions. If this check identifies issues, work with Cisco TAC to clean up excess files and resolve any bootx service failures before attempting the upgrade.

[0]: https://github.com/datacenter/ACI-Pre-Upgrade-Validation-Script
[1]: https://www.cisco.com/c/dam/en/us/td/docs/Website/datacenter/apicmatrix/index.html
[2]: https://www.cisco.com/c/en/us/support/switches/nexus-9000-series-switches/products-release-notes-list.html
Expand Down Expand Up @@ -2676,3 +2693,4 @@ If any instances of `configpushShardCont` are flagged by this script, Cisco TAC
[59]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwp95515
[60]: https://www.cisco.com/c/en/us/solutions/collateral/data-center-virtualization/application-centric-infrastructure/white-paper-c11-743951.html#Inter
[61]: https://www.cisco.com/c/en/us/solutions/collateral/data-center-virtualization/application-centric-infrastructure/white-paper-c11-743951.html#EnablePolicyCompression
[62]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwn37676
50 changes: 50 additions & 0 deletions tests/checks/bootx_service_failure_checks/fabricNode.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
[
{
"fabricNode": {
"attributes": {
"address": "10.0.0.1",
"dn": "topology/pod-1/node-1",
"fabricSt": "commissioned",
"id": "1",
"model": "APIC-SERVER-L2",
"monPolDn": "uni/fabric/monfab-default",
"name": "apic1",
"nodeType": "unspecified",
"podId": "1",
"role": "controller"
}
}
},
{
"fabricNode": {
"attributes": {
"address": "10.0.0.2",
"dn": "topology/pod-1/node-2",
"fabricSt": "commissioned",
"id": "2",
"model": "APIC-SERVER-L2",
"monPolDn": "uni/fabric/monfab-default",
"name": "apic2",
"nodeType": "unspecified",
"podId": "1",
"role": "controller"
}
}
},
{
"fabricNode": {
"attributes": {
"address": "10.0.0.3",
"dn": "topology/pod-1/node-3",
"fabricSt": "commissioned",
"id": "3",
"model": "APIC-SERVER-L2",
"monPolDn": "uni/fabric/monfab-default",
"name": "apic3",
"nodeType": "unspecified",
"podId": "1",
"role": "controller"
}
}
}
]
Loading