@check_wrapper(check_title="Tacacs server unresponsive check")
def tacacs_server_unresponsive_check(fabric_nodes, tversion, username, password, **kwargs):
    """Count "AAA server is unresponsive" events in each APIC's nginx log (CSCwo28197).

    For target versions older than 6.1(4h), connect to every node in
    `fabric_nodes` whose role is `controller` and run `zgrep -c` for the
    message in /var/log/dme/log/nginx.bin.log.

    Args:
        fabric_nodes: list of `fabricNode` objects; `role`, `address` and
            `name` are read from each one's `attributes`.
        tversion: target version object exposing `older_than()`, or a falsy
            value when no target version was provided.
        username: login name assigned to each APIC connection.
        password: password assigned to each APIC connection.

    Returns:
        Result with one of:
        - MANUAL when `tversion` is missing.
        - PASS (VER_NOT_AFFECTED) when `tversion` is not older than 6.1(4h).
        - ERROR when there is no controller in `fabric_nodes`, or when
          connecting to / querying any APIC raised an exception.
        - FAIL_O when at least one APIC reported a count above zero.
        - PASS when every APIC reported zero.
    """
    headers = ['APIC_Name', 'count']
    data = []
    recommended_action = "Contact Cisco TAC for Support before upgrade"
    doc_url = "https://datacenter.github.io/ACI-Pre-Upgrade-Validation-Script/validations/#tacacs-server-unresponsive-check"
    grep_cmd = 'cd /var/log/dme/log && zgrep -c "AAA server is unresponsive or too slow to respond" nginx.bin.log'

    if not tversion:
        return Result(result=MANUAL, msg=TVER_MISSING)
    if not tversion.older_than("6.1(4h)"):
        return Result(result=PASS, msg=VER_NOT_AFFECTED)

    controllers = [node for node in fabric_nodes if node['fabricNode']['attributes']['role'] == 'controller']
    if not controllers:
        return Result(result=ERROR, msg="No fabricNode of APIC. Is the cluster healthy?", doc_url=doc_url)

    has_error = False
    for controller in controllers:
        attr = controller['fabricNode']['attributes']
        # Reset per APIC so the cleanup below never touches an unbound name
        # or the previous APIC's connection when Connection() itself fails.
        connection = None
        try:
            connection = Connection(attr['address'])
            connection.username = username
            connection.password = password
            connection.connect()
            connection.cmd(grep_cmd)
            # `zgrep -c` prints the number of matching lines; anything that is
            # not an integer raises ValueError and is reported as an error row.
            count = int(connection.output.strip())
            if count > 0:
                data.append([attr['name'], count])
        except Exception as e:
            has_error = True
            # The exception text is reported in the 'count' column for this APIC.
            data.append([attr['name'], str(e)])
        finally:
            if connection is not None:
                connection.close()

    if has_error:
        result = ERROR
    elif data:
        result = FAIL_O
    else:
        result = PASS
    return Result(result=result, headers=headers, data=data, recommended_action=recommended_action, doc_url=doc_url)
version while having any `conf If any instances of `configpushShardCont` are flagged by this script, Cisco TAC must be contacted to identify and resolve the underlying issue before performing the upgrade. +### Tacacs server unresponsive check + +Due to [CSCwo28197][62], APIC TACACS authentication may fail with the error “AAA server is unresponsive or too slow to respond” caused by stalled AAA/PAM requests after prolonged uptime. The issue may temporarily clear after an APIC reboot or nginx restart. + +Administrators may be unable to log in to the APIC using TACACS authentication via GUI or SSH, potentially affecting access during operations or upgrades. + +Check APIC nginx logs for TACACS unresponsive errors on affected releases. If detected, contact Cisco TAC or upgrade to a release containing the fix for CSCwo28197 before proceeding with an upgrade. + [0]: https://github.com/datacenter/ACI-Pre-Upgrade-Validation-Script [1]: https://www.cisco.com/c/dam/en/us/td/docs/Website/datacenter/apicmatrix/index.html @@ -2676,3 +2686,4 @@ If any instances of `configpushShardCont` are flagged by this script, Cisco TAC [59]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwp95515 [60]: https://www.cisco.com/c/en/us/solutions/collateral/data-center-virtualization/application-centric-infrastructure/white-paper-c11-743951.html#Inter [61]: https://www.cisco.com/c/en/us/solutions/collateral/data-center-virtualization/application-centric-infrastructure/white-paper-c11-743951.html#EnablePolicyCompression +[62]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwo28197 diff --git a/tests/checks/tacacs_server_unresponsive_check/fabricNode.json b/tests/checks/tacacs_server_unresponsive_check/fabricNode.json new file mode 100644 index 0000000..962a4ad --- /dev/null +++ b/tests/checks/tacacs_server_unresponsive_check/fabricNode.json @@ -0,0 +1,93 @@ +[ + { + "fabricNode": { + "attributes": { + "address": "10.0.0.1", + "dn": "topology/pod-1/node-1", + "fabricSt": "commissioned", + "id": 
"1", + "model": "APIC-SERVER-L2", + "monPolDn": "uni/fabric/monfab-default", + "name": "apic1", + "nodeType": "unspecified", + "role": "controller" + } + } + }, + { + "fabricNode": { + "attributes": { + "address": "10.0.0.2", + "dn": "topology/pod-1/node-2", + "fabricSt": "commissioned", + "id": "2", + "model": "APIC-SERVER-L2", + "monPolDn": "uni/fabric/monfab-default", + "name": "apic2", + "nodeType": "unspecified", + "role": "controller" + } + } + }, + { + "fabricNode": { + "attributes": { + "address": "10.0.0.3", + "dn": "topology/pod-2/node-3", + "fabricSt": "commissioned", + "id": "3", + "model": "APIC-SERVER-L2", + "monPolDn": "uni/fabric/monfab-default", + "name": "apic3", + "nodeType": "unspecified", + "role": "controller" + } + } + }, + { + "fabricNode": { + "attributes": { + "address": "10.0.0.101", + "dn": "topology/pod-1/node-101", + "fabricSt": "active", + "id": "101", + "model": "N9K-C93180YC-FX", + "monPolDn": "uni/fabric/monfab-default", + "name": "leaf101", + "nodeType": "unspecified", + "role": "leaf" + } + } + }, + { + "fabricNode": { + "attributes": { + "address": "10.0.0.102", + "dn": "topology/pod-1/node-102", + "fabricSt": "active", + "id": "102", + "model": "N9K-C93180YC-FX", + "monPolDn": "uni/fabric/monfab-default", + "name": "leaf102", + "nodeType": "unspecified", + "role": "leaf" + } + } + }, + { + "fabricNode": { + "attributes": { + "address": "10.0.0.201", + "dn": "topology/pod-1/node-201", + "fabricSt": "active", + "id": "201", + "model": "N9K-C9504", + "monPolDn": "uni/fabric/monfab-default", + "name": "spine201", + "nodeType": "unspecified", + "role": "spine" + } + } + } +] + diff --git a/tests/checks/tacacs_server_unresponsive_check/fabricNode_noApic.json b/tests/checks/tacacs_server_unresponsive_check/fabricNode_noApic.json new file mode 100644 index 0000000..254f40d --- /dev/null +++ b/tests/checks/tacacs_server_unresponsive_check/fabricNode_noApic.json @@ -0,0 +1,48 @@ +[ + { + "fabricNode": { + "attributes": { + "address": 
"10.0.0.101", + "dn": "topology/pod-1/node-101", + "fabricSt": "active", + "id": "101", + "model": "N9K-C93180YC-FX", + "monPolDn": "uni/fabric/monfab-default", + "name": "leaf101", + "nodeType": "unspecified", + "role": "leaf" + } + } + }, + { + "fabricNode": { + "attributes": { + "address": "10.0.0.102", + "dn": "topology/pod-1/node-102", + "fabricSt": "active", + "id": "102", + "model": "N9K-C93180YC-FX", + "monPolDn": "uni/fabric/monfab-default", + "name": "leaf102", + "nodeType": "unspecified", + "role": "leaf" + } + } + }, + { + "fabricNode": { + "attributes": { + "address": "10.0.0.201", + "dn": "topology/pod-1/node-201", + "fabricSt": "active", + "id": "201", + "model": "N9K-C9504", + "monPolDn": "uni/fabric/monfab-default", + "name": "spine201", + "nodeType": "unspecified", + "role": "spine" + } + } + } +] + diff --git a/tests/checks/tacacs_server_unresponsive_check/test_tecacsUnresponsiveCheckScript.py b/tests/checks/tacacs_server_unresponsive_check/test_tecacsUnresponsiveCheckScript.py new file mode 100644 index 0000000..e9fb155 --- /dev/null +++ b/tests/checks/tacacs_server_unresponsive_check/test_tecacsUnresponsiveCheckScript.py @@ -0,0 +1,188 @@ +import os +import pytest +import logging +import importlib +from helpers.utils import read_data + +script = importlib.import_module("aci-preupgrade-validation-script") + +log = logging.getLogger(__name__) +dir = os.path.dirname(os.path.abspath(__file__)) + +test_function = "tacacs_server_unresponsive_check" + +fabricNodes = read_data(dir, "fabricNode.json") +controllers = [mo for mo in fabricNodes if mo["fabricNode"]["attributes"]["role"] == "controller"] + +grep_cmd = 'cd /var/log/dme/log && zgrep -c "AAA server is unresponsive or too slow to respond" nginx.bin.log' + +grep_output_zero = "0" +grep_output_with_events_apic1 = "150" +grep_output_with_events_apic2 = "200" +grep_output_with_events_apic3 = "75" + + +@pytest.mark.parametrize( + "icurl_outputs, fabric_nodes, conn_failure, conn_cmds, tversion, 
expected_result", + [ + # NO TVERSION PROVIDED - MANUAL CHECK + ( + {}, + fabricNodes, + False, + {}, + None, + script.MANUAL, + ), + # TVERSION >= 6.1(4h) - Not Affected + ( + {}, + fabricNodes, + False, + {}, + "6.1(4h)", + script.PASS, + ), + # No fabricNode for APICs + ( + {}, + read_data(dir, "fabricNode_noApic.json"), + False, + {}, + "6.1(3a)", + script.ERROR, + ), + # Exception failure at grep command on all APICs + ( + {}, + fabricNodes, + False, + { + controller["fabricNode"]["attributes"]["address"]: [ + { + "cmd": grep_cmd, + "output": "", + "exception": Exception("Simulated exception at grep command"), + } + ] + for controller in controllers + }, + "6.1(3a)", + script.ERROR, + ), + # TVERSION < 6.1(4h) and no TACACS events found (count = 0 on all APICs) + ( + {}, + fabricNodes, + False, + { + controller['fabricNode']['attributes']['address']: [ + { + "cmd": grep_cmd, + "output": "0\n", + "exception": None, + } + ] + for controller in controllers + }, + "5.2(1a)", + script.PASS, + ), + # TVERSION < 6.1(4h) and TACACS events found on all APICs + ( + {}, + fabricNodes, + False, + { + controllers[0]["fabricNode"]["attributes"]["address"]: [ + { + "cmd": grep_cmd, + "output": "150\n", + "exception": None, + } + ], + controllers[1]["fabricNode"]["attributes"]["address"]: [ + { + "cmd": grep_cmd, + "output": "200\n", + "exception": None, + } + ], + controllers[2]["fabricNode"]["attributes"]["address"]: [ + { + "cmd": grep_cmd, + "output": "75\n", + "exception": None, + } + ], + }, + "6.1(3a)", + script.FAIL_O, + ), + # Mixed scenario: Some APICs have events, some don't + ( + {}, + fabricNodes, + False, + { + controllers[0]["fabricNode"]["attributes"]["address"]: [ + { + "cmd": grep_cmd, + "output": "150\n", + "exception": None, + } + ], + controllers[1]["fabricNode"]["attributes"]["address"]: [ + { + "cmd": grep_cmd, + "output": "0\n", + "exception": None, + } + ], + controllers[2]["fabricNode"]["attributes"]["address"]: [ + { + "cmd": grep_cmd, + "output": 
"75\n", + "exception": None, + } + ], + }, + "6.0(2a)", + script.FAIL_O, + ), + # Mixed scenario: Connection failure on one APIC, success with events on others + ( + {}, + fabricNodes, + False, + { + controllers[0]["fabricNode"]["attributes"]["address"]: [ + { + "cmd": grep_cmd, + "output": "150\n", + "exception": None, + } + ], + controllers[1]["fabricNode"]["attributes"]["address"]: [ + { + "cmd": grep_cmd, + "output": "", + "exception": Exception("SSH timeout"), + } + ], + controllers[2]["fabricNode"]["attributes"]["address"]: [ + { + "cmd": grep_cmd, + "output": "0\n", + "exception": None, + } + ], + }, + "6.1(2a)", + script.ERROR, + ), + ], +) +def test_logic( run_check,mock_icurl,mock_conn,icurl_outputs,fabric_nodes,tversion,expected_result): + result = run_check(fabric_nodes=fabric_nodes,tversion=script.AciVersion(tversion) if tversion else None,username="admin",password="password") + assert result.result == expected_result