def _bootx_first_count(output):
    """Return the first all-digit line of `output` as an int, or 0 if none.

    The command output also carries non-numeric lines (the unit tests feed
    e.g. "1500\\napic1#"), so every line that is not purely digits is skipped.
    Output without any numeric line is treated as a count of 0.
    """
    for line in output.strip().split('\n'):
        line = line.strip()
        if line.isdigit():
            return int(line)
    return 0


@check_wrapper(check_title='Bootx Service failure checks')
def bootx_service_failure_checks(fabric_nodes, cversion, username, password, **kwargs):
    """Flag APICs whose bootx service is at risk of failing (CSCwn37676).

    Only runs when the current version is within 6.0(2h)-6.0(8h) or
    6.1(1f)-6.1(2g). Each controller is reached over SSH and two counts are
    collected:
      * number of entries in /firmware/tmp (flagged when >= 1000)
      * number of lines matching "fatal" (case-insensitive) under
        /var/log/bootx/logs (flagged when > 0)

    Args:
        fabric_nodes: list of fabricNode objects; only role 'controller' is used.
        cversion: current version object exposing older_than()/newer_than().
        username: SSH username for the APICs.
        password: SSH password for the APICs.

    Returns:
        Result with:
          MANUAL  - cversion was not provided
          ERROR   - no fabric nodes / no controllers, or an SSH connection or
                    command failed and no APIC was otherwise flagged
          PASS    - version not affected, or nothing was flagged
          FAIL_UF - at least one APIC was flagged
    """
    result = PASS
    headers = ["Node", "File Count", "Fatal Errors Found", "Status"]
    data = []
    recommended_action = 'Contact Cisco TAC to investigate all flagged high file and log counts'
    # Anchor must match the heading slug used in docs/docs/validations.md
    # ("### Bootx Service failure checks" -> #bootx-service-failure-checks).
    doc_url = 'https://datacenter.github.io/ACI-Pre-Upgrade-Validation-Script/validations/#bootx-service-failure-checks'

    if not cversion:
        return Result(result=MANUAL, msg="Current version not provided")

    if not fabric_nodes:
        return Result(result=ERROR, msg="Fabric node response empty. Is the cluster healthy?", doc_url=doc_url)

    affected = (
        (not cversion.older_than("6.0(2h)") and not cversion.newer_than("6.0(8h)")) or
        (not cversion.older_than("6.1(1f)") and not cversion.newer_than("6.1(2g)"))
    )
    if not affected:
        return Result(result=PASS, msg=VER_NOT_AFFECTED)

    # Filter for controller nodes only
    controllers = [node for node in fabric_nodes if node['fabricNode']['attributes']['role'] == 'controller']
    if not controllers:
        return Result(result=ERROR, msg="No controller nodes found. Is the cluster healthy?", doc_url=doc_url)

    checked_apics = set()
    has_error = False

    for apic in controllers:
        attr = apic['fabricNode']['attributes']
        # The same address may be listed more than once; check each APIC once.
        if attr['address'] in checked_apics:
            continue
        checked_apics.add(attr['address'])
        node_id = attr['id']

        try:
            c = Connection(attr['address'])
            c.username = username
            c.password = password
            c.log = LOG_FILE
            c.connect()
        except Exception as e:
            data.append([node_id, '-', '-', 'ERROR: %s' % str(e)])
            has_error = True
            continue

        try:
            # Count entries in /firmware/tmp; a missing directory counts as 0.
            c.cmd('[ -d /firmware/tmp ] && ls -1 /firmware/tmp 2>/dev/null | wc -l || echo 0')
            file_count = _bootx_first_count(c.output)

            # Count "fatal" lines in the bootx logs; a missing directory counts as 0.
            c.cmd('[ -d /var/log/bootx/logs ] && grep -Ri "fatal" /var/log/bootx/logs/* 2>/dev/null | wc -l || echo 0')
            fatal_count = _bootx_first_count(c.output)
        except Exception as e:
            data.append([node_id, '-', '-', 'ERROR: %s' % str(e)])
            has_error = True
        else:
            reasons = []
            if file_count >= 1000:
                reasons.append('High file count')
            if fatal_count > 0:
                reasons.append('Fatal errors found')
            if reasons:
                # One row per flagged APIC carrying both real counts, instead of
                # a separate row per condition with the other count masked.
                data.append([node_id, str(file_count), str(fatal_count), 'FAIL - ' + ', '.join(reasons)])
                result = FAIL_UF
        finally:
            # Always release the SSH session, including when a command failed.
            c.close()

    # Connection/command problems only decide the outcome when nothing failed.
    if has_error and result == PASS:
        result = ERROR

    return Result(result=result, headers=headers, data=data, recommended_action=recommended_action, doc_url=doc_url)
a/docs/docs/validations.md b/docs/docs/validations.md index fa1fc0e..c2b343d 100644 --- a/docs/docs/validations.md +++ b/docs/docs/validations.md @@ -191,6 +191,7 @@ Items | Defect | This Script [Stale pconsRA Object][d26] | CSCwp22212 | :warning:{title="Deprecated"} | :no_entry_sign: [ISIS DTEPs Byte Size][d27] | CSCwp15375 | :white_check_mark: | :no_entry_sign: [Policydist configpushShardCont Crash][d28] | CSCwp95515 | :white_check_mark: | +[Bootx Service failure checks][d29] | CSCwn37676 | :white_check_mark: | :no_entry_sign: [d1]: #ep-announce-compatibility [d2]: #eventmgr-db-size-defect-susceptibility @@ -220,6 +221,7 @@ Items | Defect | This Script [d26]: #stale-pconsra-object [d27]: #isis-dteps-byte-size [d28]: #policydist-configpushshardcont-crash +[d29]: #bootx-service-failure-checks ## General Check Details @@ -2614,6 +2616,21 @@ Due to [CSCwp95515][59], upgrading to an affected version while having any `conf If any instances of `configpushShardCont` are flagged by this script, Cisco TAC must be contacted to identify and resolve the underlying issue before performing the upgrade. +### Bootx Service failure checks + +Due to [CSCwn37676][62], when the APICs are running a release from 6.0(2h) through 6.0(8h) or from 6.1(1f) through 6.1(2g), upgrading to any target version with a high number of files in the `/firmware/tmp/` directory (1000 or more) or with fatal errors present in `/var/log/bootx/logs/` can cause the bootx service to fail, resulting in upgrade failures. + +The script performs two validations on each APIC: + +1. Checks if `/firmware/tmp/` directory contains 1000 or more files +2. Searches for "fatal" errors in `/var/log/bootx/logs/` + +!!! warning + If this check fails, verify the bootx service status on the affected APIC(s) by running `systemctl status bootx`. If the service is not running, the APIC is already experiencing the issue, which must be resolved before proceeding with the upgrade. + +!!! 
tip + Certain high-churn logging configurations have been found to cause excessive files in `/firmware/tmp/` while on non-fixed versions. If this check identifies issues, work with Cisco TAC to clean up excess files and resolve any bootx service failures before attempting the upgrade. + [0]: https://github.com/datacenter/ACI-Pre-Upgrade-Validation-Script [1]: https://www.cisco.com/c/dam/en/us/td/docs/Website/datacenter/apicmatrix/index.html [2]: https://www.cisco.com/c/en/us/support/switches/nexus-9000-series-switches/products-release-notes-list.html @@ -2676,3 +2693,4 @@ If any instances of `configpushShardCont` are flagged by this script, Cisco TAC [59]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwp95515 [60]: https://www.cisco.com/c/en/us/solutions/collateral/data-center-virtualization/application-centric-infrastructure/white-paper-c11-743951.html#Inter [61]: https://www.cisco.com/c/en/us/solutions/collateral/data-center-virtualization/application-centric-infrastructure/white-paper-c11-743951.html#EnablePolicyCompression +[62]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwn37676 \ No newline at end of file diff --git a/tests/checks/bootx_service_failure_checks/fabricNode.json b/tests/checks/bootx_service_failure_checks/fabricNode.json new file mode 100644 index 0000000..d102b18 --- /dev/null +++ b/tests/checks/bootx_service_failure_checks/fabricNode.json @@ -0,0 +1,50 @@ +[ + { + "fabricNode": { + "attributes": { + "address": "10.0.0.1", + "dn": "topology/pod-1/node-1", + "fabricSt": "commissioned", + "id": "1", + "model": "APIC-SERVER-L2", + "monPolDn": "uni/fabric/monfab-default", + "name": "apic1", + "nodeType": "unspecified", + "podId": "1", + "role": "controller" + } + } + }, + { + "fabricNode": { + "attributes": { + "address": "10.0.0.2", + "dn": "topology/pod-1/node-2", + "fabricSt": "commissioned", + "id": "2", + "model": "APIC-SERVER-L2", + "monPolDn": "uni/fabric/monfab-default", + "name": "apic2", + "nodeType": "unspecified", + "podId": "1", 
import os
import pytest
import logging
import importlib
from helpers.utils import read_data

script = importlib.import_module("aci-preupgrade-validation-script")

log = logging.getLogger(__name__)
dir = os.path.dirname(os.path.abspath(__file__))

# API query for fabricNode (get_fabric_nodes() uses 'fabricNode.json' without filter)
fabricNode_api = 'fabricNode.json'

# Commands that will be executed via SSH
ls_firmware_tmp_cmd = '[ -d /firmware/tmp ] && ls -1 /firmware/tmp 2>/dev/null | wc -l || echo 0'
grep_fatal_bootx_cmd = '[ -d /var/log/bootx/logs ] && grep -Ri "fatal" /var/log/bootx/logs/* 2>/dev/null | wc -l || echo 0'

test_function = "bootx_service_failure_checks"


def _nodes():
    """Return a fresh icurl mock holding the three-APIC fabricNode fixture."""
    return {fabricNode_api: read_data(dir, "fabricNode.json")}


def _apic_cmds(apic_no, files, fatals):
    """Return the two mocked SSH commands of one APIC.

    `files` / `fatals` are the first output line of the /firmware/tmp count and
    the bootx "fatal" count respectively; the second line is the APIC prompt.
    """
    prompt = "apic%d#" % apic_no
    return [
        {"cmd": ls_firmware_tmp_cmd, "output": "%s\n%s" % (files, prompt), "exception": None},
        {"cmd": grep_fatal_bootx_cmd, "output": "%s\n%s" % (fatals, prompt), "exception": None},
    ]


def _apics(*counts):
    """Map 10.0.0.1, 10.0.0.2, ... to mocked commands from (files, fatals) pairs."""
    return {
        "10.0.0.%d" % apic_no: _apic_cmds(apic_no, files, fatals)
        for apic_no, (files, fatals) in enumerate(counts, start=1)
    }


def _check_result(run_check, icurl_outputs, cversion):
    """Run the check as every test in this module does and return its result code."""
    version = script.AciVersion(cversion) if cversion else None
    nodes = icurl_outputs.get(fabricNode_api, [])
    outcome = run_check(fabric_nodes=nodes, cversion=version, username="admin", password="password")
    return outcome.result


@pytest.mark.parametrize(
    "icurl_outputs, conn_cmds, cversion, expected_result",
    [
        # Test 1: Version not provided (cversion is None)
        (_nodes(), {}, None, script.MANUAL),
        # Test 2: Version not affected (below 6.0(2h))
        (_nodes(), {}, "6.0(1a)", script.PASS),
        # Test 3: Version not affected (above 6.0(8h))
        (_nodes(), {}, "6.0(9a)", script.PASS),
        # Test 4: Version not affected (between 6.0(8h) and 6.1(1f))
        (_nodes(), {}, "6.0(9h)", script.PASS),
        # Test 5: Version not affected (above 6.1(2g))
        (_nodes(), {}, "6.1(3a)", script.PASS),
        # Test 6: Version not affected 6.0(2f) (below 6.0(2h)), no issues found
        (_nodes(), _apics((0, 0), (0, 0), (0, 0)), "6.0(2f)", script.PASS),
        # Test 7: Affected version 6.0(5a), file count >= 1000 on one APIC
        (_nodes(), _apics((1500, 0), (100, 0), (50, 0)), "6.0(5a)", script.FAIL_UF),
        # Test 8: Affected version 6.0(8f), fatal errors found on one APIC
        (_nodes(), _apics((50, 5), (30, 0), (20, 0)), "6.0(8f)", script.FAIL_UF),
        # Test 9: Affected version 6.1(1f), both high file count and fatal errors
        (_nodes(), _apics((2000, 10), (500, 0), (100, 0)), "6.1(1f)", script.FAIL_UF),
        # Test 10: Affected version 6.1(2f), multiple APICs with issues
        (_nodes(), _apics((1200, 0), (1500, 2), (100, 0)), "6.1(2f)", script.FAIL_UF),
        # Test 11: Affected version 6.0(3a), file count exactly 1000 (boundary test)
        (_nodes(), _apics((1000, 0), (100, 0), (50, 0)), "6.0(3a)", script.FAIL_UF),
        # Test 12: Affected version 6.0(4a), file count just below 1000 (boundary test)
        (_nodes(), _apics((999, 0), (100, 0), (50, 0)), "6.0(4a)", script.PASS),
        # Test 13: Affected version 6.1(2a), only fatal errors (no high file count)
        (_nodes(), _apics((10, 0), (20, 3), (15, 7)), "6.1(2a)", script.FAIL_UF),
    ],
)
def test_logic(run_check, mock_icurl, mock_conn, icurl_outputs, cversion, expected_result):
    assert _check_result(run_check, icurl_outputs, cversion) == expected_result


@pytest.mark.parametrize(
    "icurl_outputs, conn_cmds, conn_failure, cversion, expected_result",
    [
        # Test 14: Affected version 6.0(5a), SSH connection failure
        (_nodes(), {}, True, "6.0(5a)", script.ERROR),
        # Test 15: Affected version 6.0(7a), SSH command execution error on one APIC
        (
            _nodes(),
            {
                "10.0.0.1": [
                    {"cmd": ls_firmware_tmp_cmd, "output": "", "exception": Exception("Command failed")},
                ],
                "10.0.0.2": _apic_cmds(2, 100, 0),
                "10.0.0.3": _apic_cmds(3, 50, 0),
            },
            False,
            "6.0(7a)",
            script.ERROR,
        ),
    ],
)
def test_connection_errors(run_check, mock_icurl, mock_conn, icurl_outputs, cversion, expected_result):
    assert _check_result(run_check, icurl_outputs, cversion) == expected_result


@pytest.mark.parametrize(
    "icurl_outputs, conn_cmds, cversion, expected_result",
    [
        # Test 16: Affected version 6.0(5a), empty fabricNode response (unhealthy cluster)
        ({fabricNode_api: []}, {}, "6.0(5a)", script.ERROR),
        # Test 17: Affected version 6.0(6a), non-numeric output from commands (edge case)
        (_nodes(), _apics(("error", 0), (0, "invalid"), (50, 0)), "6.0(6a)", script.PASS),
    ],
)
def test_edge_cases(run_check, mock_icurl, mock_conn, icurl_outputs, cversion, expected_result):
    assert _check_result(run_check, icurl_outputs, cversion) == expected_result