From e19b8fa00b8a25b4f9a1b7f537ee9783e1ecdbae Mon Sep 17 00:00:00 2001 From: jaspals Date: Fri, 20 Feb 2026 16:29:12 -0600 Subject: [PATCH 1/6] initial commit --- nodescraper/plugins/inband/rdma/__init__.py | 28 + .../plugins/inband/rdma/rdma_analyzer.py | 183 ++++ .../plugins/inband/rdma/rdma_collector.py | 183 ++++ .../plugins/inband/rdma/rdma_plugin.py | 38 + nodescraper/plugins/inband/rdma/rdmadata.py | 77 ++ .../fixtures/rdma_plugin_config.json | 1 + test/functional/test_plugin_configs.py | 1 + .../fixtures/rdma_link_example_data.json | 38 + .../fixtures/rdma_statistic_example_data.json | 826 ++++++++++++++++++ test/unit/plugin/test_rdma_analyzer.py | 272 ++++++ test/unit/plugin/test_rdma_collector.py | 101 +++ 11 files changed, 1748 insertions(+) create mode 100644 nodescraper/plugins/inband/rdma/__init__.py create mode 100644 nodescraper/plugins/inband/rdma/rdma_analyzer.py create mode 100644 nodescraper/plugins/inband/rdma/rdma_collector.py create mode 100644 nodescraper/plugins/inband/rdma/rdma_plugin.py create mode 100644 nodescraper/plugins/inband/rdma/rdmadata.py create mode 100644 test/functional/fixtures/rdma_plugin_config.json create mode 100644 test/unit/plugin/fixtures/rdma_link_example_data.json create mode 100644 test/unit/plugin/fixtures/rdma_statistic_example_data.json create mode 100644 test/unit/plugin/test_rdma_analyzer.py create mode 100644 test/unit/plugin/test_rdma_collector.py diff --git a/nodescraper/plugins/inband/rdma/__init__.py b/nodescraper/plugins/inband/rdma/__init__.py new file mode 100644 index 00000000..733dad59 --- /dev/null +++ b/nodescraper/plugins/inband/rdma/__init__.py @@ -0,0 +1,28 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from .rdma_plugin import RdmaPlugin + +__all__ = ["RdmaPlugin"] diff --git a/nodescraper/plugins/inband/rdma/rdma_analyzer.py b/nodescraper/plugins/inband/rdma/rdma_analyzer.py new file mode 100644 index 00000000..9d6068ef --- /dev/null +++ b/nodescraper/plugins/inband/rdma/rdma_analyzer.py @@ -0,0 +1,183 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+from typing import Optional
+
+from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus
+from nodescraper.interfaces import DataAnalyzer
+from nodescraper.models import TaskResult
+
+from .rdmadata import RdmaDataModel
+
+
+class RdmaAnalyzer(DataAnalyzer[RdmaDataModel, None]):
+    """Check RDMA statistics for errors (RoCE and other RDMA error counters)."""
+
+    DATA_MODEL = RdmaDataModel
+
+    # Error fields checked from rdma statistic output (bnxt_re, mlx5, ionic, etc.)
+    ERROR_FIELDS = [
+        "recoverable_errors",
+        "tx_roce_errors",
+        "tx_roce_discards",
+        "rx_roce_errors",
+        "rx_roce_discards",
+        "local_ack_timeout_err",
+        "packet_seq_err",
+        "max_retry_exceeded",
+        "rnr_nak_retry_err",
+        "implied_nak_seq_err",
+        "unrecoverable_err",
+        "bad_resp_err",
+        "local_qp_op_err",
+        "local_protection_err",
+        "mem_mgmt_op_err",
+        "req_remote_invalid_request",
+        "req_remote_access_errors",
+        "remote_op_err",
+        "duplicate_request",
+        "res_exceed_max",
+        "resp_local_length_error",
+        "res_exceeds_wqe",
+        "res_opcode_err",
+        "res_rx_invalid_rkey",
+        "res_rx_domain_err",
+        "res_rx_no_perm",
+        "res_rx_range_err",
+        "res_tx_invalid_rkey",
+        "res_tx_domain_err",
+        "res_tx_no_perm",
+        "res_tx_range_err",
+        "res_irrq_oflow",
+        "res_unsup_opcode",
+        "res_unaligned_atomic",
+        "res_rem_inv_err",
+        "res_mem_err",
+        "res_srq_err",
+        "res_cmp_err",
+        "res_invalid_dup_rkey",
+        "res_wqe_format_err",
+        "res_cq_load_err",
+        "res_srq_load_err",
+        "res_tx_pci_err",
+        "res_rx_pci_err",
+        "out_of_buffer",
+        "out_of_sequence",
+        "req_cqe_error",
+        "req_cqe_flush_error",
+        "resp_cqe_error",
+        "resp_cqe_flush_error",
+        "resp_remote_access_errors",
+        "req_rx_pkt_seq_err",
+        "req_rx_rnr_retry_err",
+        "req_rx_rmt_acc_err",
+        "req_rx_rmt_req_err",
+        "req_rx_oper_err",
+        "req_rx_impl_nak_seq_err",
+        "req_rx_cqe_err",
+        "req_rx_cqe_flush",
+        "req_rx_dup_response",
+        "req_rx_inval_pkts",
+        "req_tx_loc_acc_err",
+        "req_tx_loc_oper_err",
+        "req_tx_mem_mgmt_err",
+        "req_tx_retry_excd_err",
+        "req_tx_loc_sgl_inv_err",
+        "resp_rx_dup_request",
+        "resp_rx_outof_buf",
+        "resp_rx_outouf_seq",
+        "resp_rx_cqe_err",
+        "resp_rx_cqe_flush",
+        "resp_rx_loc_len_err",
+        "resp_rx_inval_request",
+        "resp_rx_loc_oper_err",
+        "resp_rx_outof_atomic",
+        "resp_tx_pkt_seq_err",
+        "resp_tx_rmt_inval_req_err",
+        "resp_tx_rmt_acc_err",
+        "resp_tx_rmt_oper_err",
+        "resp_tx_rnr_retry_err",
+        "resp_tx_loc_sgl_inv_err",
+        "resp_rx_s0_table_err",
+        "resp_rx_ccl_cts_outouf_seq",
+        "tx_rdma_ack_timeout",
+        "tx_rdma_ccl_cts_ack_timeout",
+        "rx_rdma_mtu_discard_pkts",
+    ]
+
+    CRITICAL_ERROR_FIELDS = [
+        "unrecoverable_err",
+        "res_tx_pci_err",
+        "res_rx_pci_err",
+        "res_mem_err",
+    ]
+
+    def analyze_data(self, data: RdmaDataModel, args: Optional[None] = None) -> TaskResult:
+        """Analyze RDMA statistics for non-zero error counters.
+
+        Args:
+            data: RDMA data model with statistic_list (and optionally link_list).
+            args: Unused (analyzer has no configurable args).
+
+        Returns:
+            TaskResult with status NOT_RAN if statistic_list is empty, OK if no
+            error counter is positive, ERROR if any error counter > 0.
+        """
+        if not data.statistic_list:
+            self.result.message = "RDMA statistics list is empty"
+            self.result.status = ExecutionStatus.NOT_RAN
+            return self.result
+
+        error_state = False
+        for idx, stat in enumerate(data.statistic_list):
+            for error_field in self.ERROR_FIELDS:
+                value = getattr(stat, error_field, None)
+                # Counters are read with getattr and carry no declared type, so a
+                # non-numeric value is skipped instead of raising TypeError on `>`.
+                if not (isinstance(value, (int, float)) and value > 0):
+                    continue
+                is_critical = error_field in self.CRITICAL_ERROR_FIELDS
+                self._log_event(
+                    category=EventCategory.IO,
+                    description=f"RDMA error detected: {error_field}",
+                    data={
+                        "interface": stat.ifname,
+                        "port": stat.port,
+                        "error_field": error_field,
+                        "error_count": value,
+                        "statistic_index": idx,
+                    },
+                    priority=EventPriority.CRITICAL if is_critical else EventPriority.ERROR,
+                    console_log=True,
+                )
+                error_state = True
+
+        if error_state:
+            self.result.message = "RDMA errors detected in statistics"
+            self.result.status = ExecutionStatus.ERROR
+        else:
+            self.result.message = "No RDMA errors detected in statistics"
+            self.result.status = ExecutionStatus.OK
+        return self.result
diff --git a/nodescraper/plugins/inband/rdma/rdma_collector.py b/nodescraper/plugins/inband/rdma/rdma_collector.py
new file mode 100644
index 00000000..b3e11ea6
--- /dev/null
+++ b/nodescraper/plugins/inband/rdma/rdma_collector.py
@@ -0,0 +1,183 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+import json
+from typing import Optional
+
+from pydantic import ValidationError
+
+from nodescraper.base import InBandDataCollector
+from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily
+from nodescraper.models import TaskResult
+from nodescraper.utils import get_exception_traceback
+
+from .rdmadata import RdmaDataModel, RdmaLink, RdmaStatistics
+
+
+class RdmaCollector(InBandDataCollector[RdmaDataModel, None]):
+    """Collect RDMA status and statistics via rdma link and rdma statistic commands."""
+
+    DATA_MODEL = RdmaDataModel
+    SUPPORTED_OS_FAMILY = {OSFamily.LINUX}
+
+    def _run_rdma_command(self, cmd: str) -> Optional[list[dict]]:
+        """Run an rdma subcommand with JSON output and parse the result.
+
+        Args:
+            cmd: Subcommand (e.g. 'link' or 'statistic'), without 'rdma' prefix.
+
+        Returns:
+            List parsed from the JSON output (empty if no output), or None on failure.
+        """
+        full_cmd = f"rdma {cmd} -j"
+        res = self._run_sut_cmd(full_cmd)
+
+        if res.exit_code != 0:
+            self._log_event(
+                category=EventCategory.APPLICATION,
+                description=f"Error running rdma command: {full_cmd}",
+                data={
+                    "command": full_cmd,
+                    "exit_code": res.exit_code,
+                    "stderr": res.stderr,
+                },
+                priority=EventPriority.ERROR,
+                console_log=True,
+            )
+            return None
+
+        if not res.stdout.strip():
+            return []
+
+        try:
+            parsed = json.loads(res.stdout)
+        except json.JSONDecodeError as e:
+            self._log_event(
+                category=EventCategory.APPLICATION,
+                description=f"Error parsing command: {full_cmd} json data",
+                data={
+                    "cmd": full_cmd,
+                    "exception": get_exception_traceback(e),
+                },
+                priority=EventPriority.ERROR,
+                console_log=True,
+            )
+            return None
+
+        if not isinstance(parsed, list):
+            self._log_event(
+                category=EventCategory.APPLICATION,
+                description=f"Unexpected JSON structure from command: {full_cmd}",
+                data={"cmd": full_cmd, "data_type": type(parsed).__name__},
+                priority=EventPriority.ERROR,
+                console_log=True,
+            )
+            return None
+        return parsed
+
+    def _build_models(self, entries: list, model_cls: type, label: str) -> list:
+        """Build one model per JSON entry, skipping entries that are not valid.
+
+        Args:
+            entries: Parsed JSON entries from an rdma command.
+            model_cls: Model class to build for each entry.
+            label: Entry kind used in event descriptions ('statistic' or 'link').
+
+        Returns:
+            Models for the entries that are dicts and pass validation.
+        """
+        models = []
+        for entry in entries:
+            if not isinstance(entry, dict):
+                self._log_event(
+                    category=EventCategory.APPLICATION,
+                    description=f"Invalid data type for RDMA {label}",
+                    data={"data_type": type(entry).__name__},
+                    priority=EventPriority.WARNING,
+                )
+                continue
+            # Validate per entry so one malformed entry does not discard the rest.
+            try:
+                models.append(model_cls(**entry))
+            except ValidationError as e:
+                self._log_event(
+                    category=EventCategory.APPLICATION,
+                    description=f"Failed to build {model_cls.__name__} model",
+                    data={"exception": get_exception_traceback(e)},
+                    priority=EventPriority.WARNING,
+                )
+        return models
+
+    def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]:
+        """Get RDMA statistics from 'rdma statistic -j'."""
+        stat_data = self._run_rdma_command("statistic")
+        if stat_data is None:
+            return None
+        return self._build_models(stat_data, RdmaStatistics, "statistic")
+
+    def _get_rdma_link(self) -> Optional[list[RdmaLink]]:
+        """Get RDMA link data from 'rdma link -j'."""
+        link_data = self._run_rdma_command("link")
+        if link_data is None:
+            return None
+        return self._build_models(link_data, RdmaLink, "link")
+
+    def collect_data(self, args: None = None) -> tuple[TaskResult, Optional[RdmaDataModel]]:
+        """Collect RDMA statistics and link data.
+
+        Returns:
+            Task result and RdmaDataModel (None if both commands failed or an error was raised).
+        """
+        try:
+            links = self._get_rdma_link()
+            statistics = self._get_rdma_statistics()
+
+            if statistics is None and links is None:
+                self.result.status = ExecutionStatus.EXECUTION_FAILURE
+                self.result.message = "Failed to collect RDMA data"
+                return self.result, None
+
+            rdma_data = RdmaDataModel(
+                statistic_list=statistics or [],
+                link_list=links or [],
+            )
+            self.result.message = (
+                f"Collected {len(rdma_data.statistic_list)} RDMA statistics, "
+                f"{len(rdma_data.link_list)} RDMA links"
+            )
+            self.result.status = ExecutionStatus.OK
+            return self.result, rdma_data
+
+        except Exception as e:
+            self._log_event(
+                category=EventCategory.APPLICATION,
+                description="Error running RDMA collector",
+                data={"exception": get_exception_traceback(e)},
+                priority=EventPriority.ERROR,
+                console_log=True,
+            )
+            self.result.status = ExecutionStatus.EXECUTION_FAILURE
+            self.result.message = "Error running RDMA collector"
+            return self.result, None
diff --git a/nodescraper/plugins/inband/rdma/rdma_plugin.py b/nodescraper/plugins/inband/rdma/rdma_plugin.py
new file mode 100644
index 00000000..ec3c0249
--- /dev/null
+++ b/nodescraper/plugins/inband/rdma/rdma_plugin.py
@@ -0,0 +1,38 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (c) 2025
Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+from nodescraper.base import InBandDataPlugin
+
+from .rdma_analyzer import RdmaAnalyzer
+from .rdma_collector import RdmaCollector
+from .rdmadata import RdmaDataModel
+
+
+class RdmaPlugin(InBandDataPlugin[RdmaDataModel, None, None]):
+    """RDMA plugin: RdmaCollector gathers the data, RdmaAnalyzer checks it."""
+
+    COLLECTOR = RdmaCollector
+    ANALYZER = RdmaAnalyzer
+    DATA_MODEL = RdmaDataModel
diff --git a/nodescraper/plugins/inband/rdma/rdmadata.py b/nodescraper/plugins/inband/rdma/rdmadata.py
new file mode 100644
index 00000000..e8354b82
--- /dev/null
+++ b/nodescraper/plugins/inband/rdma/rdmadata.py
@@ -0,0 +1,77 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc.
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+#
+###############################################################################
+from typing import Optional
+
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from typing_extensions import Self
+
+from nodescraper.models import DataModel
+
+
+class RdmaStatistics(BaseModel):
+    """One 'rdma statistic -j' entry; keys beyond ifname/port are kept as extras."""
+
+    model_config = ConfigDict(extra="allow")
+
+    ifname: Optional[str] = None
+    port: Optional[int] = None
+
+    @model_validator(mode="after")
+    def validate_at_least_one_field(self) -> Self:
+        if self.model_fields_set:
+            return self
+        raise ValueError("At least one field must be set in RdmaStatistics")
+
+
+class RdmaLink(BaseModel):
+    """One 'rdma link -j' entry; rejected by validation if no field is set."""
+
+    ifindex: Optional[int] = None
+    ifname: Optional[str] = None
+    port: Optional[int] = None
+    state: Optional[str] = None
+    physical_state: Optional[str] = None
+    netdev: Optional[str] = None
+    netdev_index: Optional[int] = None
+
+    @model_validator(mode="after")
+    def validate_at_least_one_field(self) -> Self:
+        if self.model_fields_set:
+            return self
+        raise ValueError("At least one field must be set in RdmaLink")
+
+
+class RdmaDataModel(DataModel):
+    """
+    Collected RDMA (Remote Direct Memory Access) link and statistic entries.
+
+    Attributes:
+        link_list: RDMA links parsed from 'rdma link -j'.
+        statistic_list: RDMA statistics parsed from 'rdma statistic -j'.
+    """
+
+    link_list: list[RdmaLink] = Field(default_factory=list)
+    statistic_list: list[RdmaStatistics] = Field(default_factory=list)
diff --git a/test/functional/fixtures/rdma_plugin_config.json b/test/functional/fixtures/rdma_plugin_config.json new file mode 100644 index 00000000..3ddd4207 --- /dev/null +++ b/test/functional/fixtures/rdma_plugin_config.json @@ -0,0 +1 @@ +{"global_args":{},"plugins":{"RdmaPlugin":{}},"result_collators":{},"name":"RdmaPlugin config","desc":"Config for testing RdmaPlugin"} diff --git a/test/functional/test_plugin_configs.py b/test/functional/test_plugin_configs.py index 7f4ea6ce..c5e93bf7 100644 --- a/test/functional/test_plugin_configs.py +++ b/test/functional/test_plugin_configs.py @@ -55,6 +55,7 @@ def plugin_config_files(fixtures_dir): "OsPlugin": fixtures_dir / "os_plugin_config.json", "PackagePlugin": fixtures_dir / "package_plugin_config.json", "ProcessPlugin": fixtures_dir / "process_plugin_config.json", + "RdmaPlugin": fixtures_dir / "rdma_plugin_config.json", "RocmPlugin": fixtures_dir / "rocm_plugin_config.json", "StoragePlugin": fixtures_dir / "storage_plugin_config.json", "SysctlPlugin": fixtures_dir / "sysctl_plugin_config.json", diff --git a/test/unit/plugin/fixtures/rdma_link_example_data.json b/test/unit/plugin/fixtures/rdma_link_example_data.json new file mode 100644 index 00000000..6c228a81 --- /dev/null +++ b/test/unit/plugin/fixtures/rdma_link_example_data.json @@ -0,0 +1,38 @@ +[ + { + "ifindex": 0, + "ifname": "ionic_0", + "port": 1, + "state": "ACTIVE", + "physical_state": "LINK_UP", + "netdev": "benic8p1", + "netdev_index": 3 + }, + { + "ifindex": 1, + "ifname": "ionic_1", + "port": 1, + "state": "ACTIVE", + "physical_state": "LINK_UP", + "netdev": "benic7p1", + "netdev_index": 6 + }, + { + "ifindex": 2, + "ifname": "ionic_2", + "port": 1, + "state": "ACTIVE", + "physical_state": "LINK_UP", + "netdev": "benic5p1", + "netdev_index": 8 + }, + { + "ifindex": 3, + "ifname": "ionic_3", + "port": 1, + "state":
"ACTIVE", + "physical_state": "LINK_UP", + "netdev": "benic6p1", + "netdev_index": 9 + } +] diff --git a/test/unit/plugin/fixtures/rdma_statistic_example_data.json b/test/unit/plugin/fixtures/rdma_statistic_example_data.json new file mode 100644 index 00000000..e338e41a --- /dev/null +++ b/test/unit/plugin/fixtures/rdma_statistic_example_data.json @@ -0,0 +1,826 @@ +[ + { + "ifname": "bnxt_re0", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + "active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 12, + "watermark_ahs": 8, + "watermark_qps": 229, + "watermark_rc_qps": 220, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 94, + "watermark_mrs": 305, + "watermark_mws": 0, + "rx_pkts": 3504998440, + "rx_bytes": 2966950848, + "tx_pkts": 2747190987, + "tx_bytes": 912073550, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + "res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + "res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + "res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + 
"res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 3324056122, + "tx_read_resp": 3324056122, + "tx_write_req": 622240024, + "tx_send_req": 97500, + "rx_atomic_requests": 0, + "rx_read_requests": 3324056122, + "rx_read_resp": 3324056122, + "rx_write_requests": 626374468, + "rx_send_req": 97500, + "rx_good_pkts": 1401322762, + "rx_good_bytes": 2966950848, + "out_of_buffer": 0, + "np_cnp_sent": 2873487760, + "rp_cnp_handled": 2103675678, + "np_ecn_marked_roce_packets": 2873487760, + "out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + "roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + }, + { + "ifname": "bnxt_re1", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + "active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 14, + "watermark_ahs": 3, + "watermark_qps": 228, + "watermark_rc_qps": 219, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 94, + "watermark_mrs": 287, + "watermark_mws": 0, + "rx_pkts": 1509751895, + "rx_bytes": 3099873130, + "tx_pkts": 692925073, + "tx_bytes": 2068663286, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + 
"res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + "res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + "res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + "res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 3322387232, + "tx_read_resp": 3322387232, + "tx_write_req": 620621144, + "tx_send_req": 0, + "rx_atomic_requests": 0, + "rx_read_requests": 3322387232, + "rx_read_resp": 3322387232, + "rx_write_requests": 621181433, + "rx_send_req": 0, + "rx_good_pkts": 3507768689, + "rx_good_bytes": 3099873130, + "out_of_buffer": 0, + "np_cnp_sent": 1097578610, + "rp_cnp_handled": 2296950502, + "np_ecn_marked_roce_packets": 1097578610, + "out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + "roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + }, + { + "ifname": "bnxt_re2", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + "active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 13, + "watermark_ahs": 4, + "watermark_qps": 230, + "watermark_rc_qps": 221, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 95, + "watermark_mrs": 294, + "watermark_mws": 0, + "rx_pkts": 2328181128, 
+ "rx_bytes": 79750872, + "tx_pkts": 1404869338, + "tx_bytes": 644434628, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + "res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + "res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + "res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + "res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 3212760135, + "tx_read_resp": 3212760135, + "tx_write_req": 1995861174, + "tx_send_req": 0, + "rx_atomic_requests": 0, + "rx_read_requests": 3212760135, + "rx_read_resp": 3212760135, + "rx_write_requests": 1995579948, + "rx_send_req": 0, + "rx_good_pkts": 4025638368, + "rx_good_bytes": 79750872, + "out_of_buffer": 0, + "np_cnp_sent": 4174752904, + "rp_cnp_handled": 2597510056, + "np_ecn_marked_roce_packets": 4174752904, + "out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + 
"roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + }, + { + "ifname": "bnxt_re3", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + "active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 12, + "watermark_ahs": 7, + "watermark_qps": 229, + "watermark_rc_qps": 220, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 95, + "watermark_mrs": 292, + "watermark_mws": 0, + "rx_pkts": 3888070733, + "rx_bytes": 3748987850, + "tx_pkts": 2265082996, + "tx_bytes": 3715380316, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + "res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + "res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + "res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + "res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 3103369202, + "tx_read_resp": 3103369202, + "tx_write_req": 3370635080, + "tx_send_req": 0, + "rx_atomic_requests": 0, + "rx_read_requests": 3103369202, + "rx_read_resp": 3103369202, + 
"rx_write_requests": 3368547249, + "rx_send_req": 0, + "rx_good_pkts": 2688805201, + "rx_good_bytes": 3748987850, + "out_of_buffer": 0, + "np_cnp_sent": 134598312, + "rp_cnp_handled": 1199265532, + "np_ecn_marked_roce_packets": 134598312, + "out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + "roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + }, + { + "ifname": "bnxt_re4", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + "active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 12, + "watermark_ahs": 6, + "watermark_qps": 230, + "watermark_rc_qps": 221, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 95, + "watermark_mrs": 302, + "watermark_mws": 0, + "rx_pkts": 986831570, + "rx_bytes": 1185181414, + "tx_pkts": 1975828812, + "tx_bytes": 2763928250, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + "res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + 
"res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + "res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + "res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 2993618119, + "tx_read_resp": 2993618119, + "tx_write_req": 449606302, + "tx_send_req": 37687, + "rx_atomic_requests": 0, + "rx_read_requests": 2993618119, + "rx_read_resp": 2993618119, + "rx_write_requests": 448485514, + "rx_send_req": 37687, + "rx_good_pkts": 2876478595, + "rx_good_bytes": 1185181414, + "out_of_buffer": 0, + "np_cnp_sent": 3525492995, + "rp_cnp_handled": 2405320271, + "np_ecn_marked_roce_packets": 3525492995, + "out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + "roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + }, + { + "ifname": "bnxt_re5", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + "active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 13, + "watermark_ahs": 7, + "watermark_qps": 228, + "watermark_rc_qps": 219, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 94, + "watermark_mrs": 287, + "watermark_mws": 0, + "rx_pkts": 3602164391, + "rx_bytes": 515322372, + "tx_pkts": 3498885620, + "tx_bytes": 3601952844, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 
0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + "res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + "res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + "res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + "res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 2883798845, + "tx_read_resp": 2883798845, + "tx_write_req": 1822414941, + "tx_send_req": 0, + "rx_atomic_requests": 0, + "rx_read_requests": 2883798845, + "rx_read_resp": 2883798845, + "rx_write_requests": 1819507161, + "rx_send_req": 0, + "rx_good_pkts": 1576292710, + "rx_good_bytes": 515322372, + "out_of_buffer": 0, + "np_cnp_sent": 4093842522, + "rp_cnp_handled": 2025871681, + "np_ecn_marked_roce_packets": 4093842522, + "out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + "roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + }, + { + "ifname": "bnxt_re6", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + 
"active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 13, + "watermark_ahs": 7, + "watermark_qps": 230, + "watermark_rc_qps": 221, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 95, + "watermark_mrs": 294, + "watermark_mws": 0, + "rx_pkts": 2577272275, + "rx_bytes": 2249875450, + "tx_pkts": 2452138468, + "tx_bytes": 700557582, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + "res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + "res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + "res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + "res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 2775090592, + "tx_read_resp": 2775090592, + "tx_write_req": 3201764210, + "tx_send_req": 0, + "rx_atomic_requests": 0, + "rx_read_requests": 2775090592, + "rx_read_resp": 2775090592, + "rx_write_requests": 3201655162, + "rx_send_req": 0, + "rx_good_pkts": 1197866395, + "rx_good_bytes": 2249875450, + "out_of_buffer": 0, + "np_cnp_sent": 2401103251, + "rp_cnp_handled": 1379405880, + "np_ecn_marked_roce_packets": 2401103251, + 
"out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + "roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + }, + { + "ifname": "bnxt_re7", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + "active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 13, + "watermark_ahs": 6, + "watermark_qps": 228, + "watermark_rc_qps": 219, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 94, + "watermark_mrs": 287, + "watermark_mws": 0, + "rx_pkts": 1606921676, + "rx_bytes": 4007942950, + "tx_pkts": 1249198409, + "tx_bytes": 25134278, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + "res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + "res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + 
"res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + "res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 2665758274, + "tx_read_resp": 2665758274, + "tx_write_req": 284646587, + "tx_send_req": 0, + "rx_atomic_requests": 0, + "rx_read_requests": 2665758274, + "rx_read_resp": 2665758274, + "rx_write_requests": 284542358, + "rx_send_req": 0, + "rx_good_pkts": 253070639, + "rx_good_bytes": 4007942950, + "out_of_buffer": 0, + "np_cnp_sent": 2670842510, + "rp_cnp_handled": 1353851037, + "np_ecn_marked_roce_packets": 2670842510, + "out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + "roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + } +] diff --git a/test/unit/plugin/test_rdma_analyzer.py b/test/unit/plugin/test_rdma_analyzer.py new file mode 100644 index 00000000..c64cab08 --- /dev/null +++ b/test/unit/plugin/test_rdma_analyzer.py @@ -0,0 +1,272 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import json +from pathlib import Path + +import pytest + +from nodescraper.enums import EventPriority, ExecutionStatus +from nodescraper.plugins.inband.rdma.rdma_analyzer import RdmaAnalyzer +from nodescraper.plugins.inband.rdma.rdmadata import ( + RdmaDataModel, + RdmaLink, + RdmaStatistics, +) + + +@pytest.fixture +def rdma_analyzer(system_info): + return RdmaAnalyzer(system_info) + + +@pytest.fixture +def plugin_fixtures_path(): + return Path(__file__).parent / "fixtures" + + +@pytest.fixture +def clean_rdma_model(plugin_fixtures_path): + """RDMA data with no errors (all counters zero).""" + path = plugin_fixtures_path / "rdma_statistic_example_data.json" + data = json.loads(path.read_text()) + stats = [RdmaStatistics(**s) for s in data] + return RdmaDataModel(statistic_list=stats) + + +@pytest.fixture +def clean_stats(plugin_fixtures_path): + """List of clean RdmaStatistics (no errors) for building models with links.""" + path = plugin_fixtures_path / "rdma_statistic_example_data.json" + data = json.loads(path.read_text()) + return [RdmaStatistics(**s) for s in data] + + +def test_no_errors_detected(rdma_analyzer, clean_rdma_model): + """Test with nominal data that has no errors.""" + result = rdma_analyzer.analyze_data(clean_rdma_model) + assert result.status == ExecutionStatus.OK + assert len(result.events) == 0 + + +def test_single_error_detected(rdma_analyzer, clean_rdma_model): + """Test with data containing a single error.""" + stats = list(clean_rdma_model.statistic_list) + stats[0].tx_roce_errors = 5 + model = RdmaDataModel(statistic_list=stats) + result = rdma_analyzer.analyze_data(model) + assert result.status == ExecutionStatus.ERROR + assert "RDMA errors detected in statistics" in result.message + assert len(result.events) == 1 + assert result.events[0].description == "RDMA error detected: tx_roce_errors" + assert result.events[0].priority == EventPriority.ERROR + 
assert result.events[0].data["error_count"] == 5 + assert result.events[0].data["interface"] == "bnxt_re0" + + +def test_multiple_errors_detected(rdma_analyzer, clean_rdma_model): + """Test with data containing multiple errors.""" + stats = list(clean_rdma_model.statistic_list) + stats[0].tx_roce_errors = 10 + stats[0].rx_roce_errors = 3 + stats[1].packet_seq_err = 7 + model = RdmaDataModel(statistic_list=stats) + result = rdma_analyzer.analyze_data(model) + assert result.status == ExecutionStatus.ERROR + assert "RDMA errors detected in statistics" in result.message + assert len(result.events) == 3 + for event in result.events: + assert event.priority == EventPriority.ERROR + + +def test_critical_error_detected(rdma_analyzer, clean_rdma_model): + """Test with data containing a critical error.""" + stats = list(clean_rdma_model.statistic_list) + stats[0].unrecoverable_err = 1 + stats[0].res_tx_pci_err = 2 + model = RdmaDataModel(statistic_list=stats) + result = rdma_analyzer.analyze_data(model) + assert result.status == ExecutionStatus.ERROR + assert "RDMA errors detected in statistics" in result.message + assert len(result.events) == 2 + critical_events = [e for e in result.events if e.priority == EventPriority.CRITICAL] + assert len(critical_events) == 2 + + +def test_empty_statistics(rdma_analyzer): + """Test with empty statistics list.""" + model = RdmaDataModel(statistic_list=[], link_list=[]) + result = rdma_analyzer.analyze_data(model) + assert result.status == ExecutionStatus.NOT_RAN + assert result.message == "RDMA statistics list is empty" + + +def test_multiple_interfaces_with_errors(rdma_analyzer, clean_rdma_model): + """Test with errors across multiple interfaces.""" + stats = list(clean_rdma_model.statistic_list) + stats[0].max_retry_exceeded = 15 + stats[2].local_ack_timeout_err = 8 + stats[4].out_of_buffer = 100 + model = RdmaDataModel(statistic_list=stats) + result = rdma_analyzer.analyze_data(model) + assert result.status == ExecutionStatus.ERROR + 
assert len(result.events) == 3 + interfaces = {event.data["interface"] for event in result.events} + assert len(interfaces) == 3 + + +def test_all_error_types(rdma_analyzer): + """Test that all error fields are properly detected.""" + stats = RdmaStatistics( + ifname="bnxt_re_test", + port=1, + recoverable_errors=1, + tx_roce_errors=1, + unrecoverable_err=1, + ) + model = RdmaDataModel(statistic_list=[stats]) + result = rdma_analyzer.analyze_data(model) + assert result.status == ExecutionStatus.ERROR + assert len(result.events) == 3 + critical_events = [e for e in result.events if e.data["error_field"] == "unrecoverable_err"] + assert len(critical_events) == 1 + assert critical_events[0].priority == EventPriority.CRITICAL + + +def test_zero_errors_are_ignored(rdma_analyzer): + """Test that zero-value errors are not reported.""" + stats = RdmaStatistics( + ifname="bnxt_re_test", + port=1, + tx_roce_errors=0, + rx_roce_errors=0, + unrecoverable_err=0, + ) + model = RdmaDataModel(statistic_list=[stats]) + result = rdma_analyzer.analyze_data(model) + assert result.status == ExecutionStatus.OK + assert len(result.events) == 0 + + +def test_rdma_link_all_active(rdma_analyzer, clean_stats): + """Test with RDMA links that are all active and up.""" + links = [ + RdmaLink( + ifindex=0, + ifname="ionic_0", + port=1, + state="ACTIVE", + physical_state="LINK_UP", + netdev="benic0p1", + netdev_index=3, + ), + RdmaLink( + ifindex=1, + ifname="ionic_1", + port=1, + state="ACTIVE", + physical_state="LINK_UP", + netdev="benic1p1", + netdev_index=4, + ), + ] + model = RdmaDataModel(statistic_list=clean_stats, link_list=links) + result = rdma_analyzer.analyze_data(model) + assert result.status == ExecutionStatus.OK + assert result.message == "No RDMA errors detected in statistics" + assert len(result.events) == 0 + + +def test_rdma_link_down_detected(rdma_analyzer, clean_stats): + """Test with RDMA links that are down""" + links = [ + RdmaLink( + ifindex=0, + ifname="ionic_0", + 
port=1, + state="ACTIVE", + physical_state="LINK_UP", + netdev="benic0p1", + netdev_index=3, + ), + RdmaLink( + ifindex=1, + ifname="ionic_1", + port=1, + state="DOWN", + physical_state="LINK_DOWN", + netdev="benic1p1", + netdev_index=4, + ), + ] + model = RdmaDataModel(statistic_list=clean_stats, link_list=links) + result = rdma_analyzer.analyze_data(model) + # Current implementation only checks statistics, not link state + assert result.status == ExecutionStatus.OK + + +def test_rdma_link_empty_list(rdma_analyzer, clean_stats): + """Test with empty RDMA link list.""" + model = RdmaDataModel(statistic_list=clean_stats, link_list=[]) + result = rdma_analyzer.analyze_data(model) + assert result.status == ExecutionStatus.OK + assert result.message == "No RDMA errors detected in statistics" + + +def test_rdma_link_multiple_interfaces(rdma_analyzer, clean_stats): + """Test with multiple RDMA interfaces with different link states.""" + links = [ + RdmaLink( + ifindex=0, + ifname="ionic_0", + port=1, + state="ACTIVE", + physical_state="LINK_UP", + netdev="benic0p1", + netdev_index=3, + ), + RdmaLink( + ifindex=1, + ifname="ionic_1", + port=1, + state="ACTIVE", + physical_state="LINK_UP", + netdev="benic1p1", + netdev_index=4, + ), + RdmaLink( + ifindex=2, + ifname="ionic_2", + port=1, + state="ACTIVE", + physical_state="LINK_UP", + netdev="benic2p1", + netdev_index=5, + ), + ] + model = RdmaDataModel(statistic_list=clean_stats, link_list=links) + result = rdma_analyzer.analyze_data(model) + assert result.status == ExecutionStatus.OK + assert len(result.events) == 0 diff --git a/test/unit/plugin/test_rdma_collector.py b/test/unit/plugin/test_rdma_collector.py new file mode 100644 index 00000000..a2508497 --- /dev/null +++ b/test/unit/plugin/test_rdma_collector.py @@ -0,0 +1,101 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from pathlib import Path + +import pytest + +from nodescraper.connection.inband.inband import CommandArtifact +from nodescraper.enums import ExecutionStatus, OSFamily +from nodescraper.enums.systeminteraction import SystemInteractionLevel +from nodescraper.plugins.inband.rdma.rdma_collector import RdmaCollector +from nodescraper.plugins.inband.rdma.rdmadata import RdmaDataModel + + +@pytest.fixture +def collector(system_info, conn_mock): + return RdmaCollector( + system_info=system_info, + system_interaction_level=SystemInteractionLevel.PASSIVE, + connection=conn_mock, + ) + + +@pytest.fixture +def rdma_statistic_output(): + path = Path(__file__).parent / "fixtures" / "rdma_statistic_example_data.json" + return path.read_text() + + +@pytest.fixture +def rdma_link_output(): + path = Path(__file__).parent / "fixtures" / "rdma_link_example_data.json" + return path.read_text() + + +def test_collect_success(collector, conn_mock, rdma_link_output, rdma_statistic_output): + """Successful collection returns RdmaDataModel with statistics and links (full fixtures).""" + collector.system_info.os_family = OSFamily.LINUX + conn_mock.run_command.side_effect = [ + CommandArtifact(exit_code=0, stdout=rdma_link_output, stderr="", command="rdma link -j"), + CommandArtifact( + exit_code=0, stdout=rdma_statistic_output, stderr="", command="rdma statistic -j" + ), + ] + res, data = collector.collect_data() + assert res.status == ExecutionStatus.OK + assert data is not None + assert isinstance(data, RdmaDataModel) + # Full statistic fixture has 8 devices (bnxt_re0..bnxt_re7) with full stats + assert len(data.statistic_list) == 8 + assert data.statistic_list[0].ifname == "bnxt_re0" + # Full link fixture has 4 ionic links + assert len(data.link_list) == 4 + assert data.link_list[0].ifname == "ionic_0" + + +def test_collect_both_commands_fail(collector, conn_mock): + """When both rdma commands fail, status 
is EXECUTION_FAILURE and data is None.""" + collector.system_info.os_family = OSFamily.LINUX + conn_mock.run_command.return_value = CommandArtifact( + exit_code=1, stdout="", stderr="rdma command failed", command="rdma link -j" + ) + res, data = collector.collect_data() + assert res.status == ExecutionStatus.EXECUTION_FAILURE + assert data is None + + +def test_collect_empty_output(collector, conn_mock): + """Empty JSON arrays yield empty lists in model.""" + collector.system_info.os_family = OSFamily.LINUX + conn_mock.run_command.side_effect = [ + CommandArtifact(exit_code=0, stdout="[]", stderr="", command="rdma link -j"), + CommandArtifact(exit_code=0, stdout="[]", stderr="", command="rdma statistic -j"), + ] + res, data = collector.collect_data() + assert res.status == ExecutionStatus.OK + assert data is not None + assert data.link_list == [] + assert data.statistic_list == [] From b0f0f96f1379b23a82a8ce6de459c1f1c881f19d Mon Sep 17 00:00:00 2001 From: jaspals Date: Mon, 23 Feb 2026 11:52:37 -0600 Subject: [PATCH 2/6] added intf name in log --- nodescraper/plugins/inband/rdma/rdma_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nodescraper/plugins/inband/rdma/rdma_analyzer.py b/nodescraper/plugins/inband/rdma/rdma_analyzer.py index 9d6068ef..e4006aaa 100644 --- a/nodescraper/plugins/inband/rdma/rdma_analyzer.py +++ b/nodescraper/plugins/inband/rdma/rdma_analyzer.py @@ -161,7 +161,7 @@ def analyze_data(self, data: RdmaDataModel, args: Optional[None] = None) -> Task ) self._log_event( category=EventCategory.IO, - description=f"RDMA error detected: {error_field}", + description=f"RDMA error detected on {stat.ifname}: {error_field}", data={ "interface": stat.ifname, "port": stat.port, From 0d75c909440acab7e2e3b2438fdfef970c8f4a53 Mon Sep 17 00:00:00 2001 From: jaspals Date: Mon, 23 Feb 2026 12:16:02 -0600 Subject: [PATCH 3/6] fixed log messages --- .../plugins/inband/rdma/rdma_analyzer.py | 41 ++++++++++--------- 
test/unit/plugin/test_rdma_analyzer.py | 33 +++++++++------ 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/nodescraper/plugins/inband/rdma/rdma_analyzer.py b/nodescraper/plugins/inband/rdma/rdma_analyzer.py index e4006aaa..065b716d 100644 --- a/nodescraper/plugins/inband/rdma/rdma_analyzer.py +++ b/nodescraper/plugins/inband/rdma/rdma_analyzer.py @@ -151,28 +151,31 @@ def analyze_data(self, data: RdmaDataModel, args: Optional[None] = None) -> Task error_state = False for idx, stat in enumerate(data.statistic_list): + errors_on_interface = [] # (error_field, value, is_critical) for error_field in self.ERROR_FIELDS: value = getattr(stat, error_field, None) if value is not None and value > 0: - priority = ( - EventPriority.CRITICAL - if error_field in self.CRITICAL_ERROR_FIELDS - else EventPriority.ERROR - ) - self._log_event( - category=EventCategory.IO, - description=f"RDMA error detected on {stat.ifname}: {error_field}", - data={ - "interface": stat.ifname, - "port": stat.port, - "error_field": error_field, - "error_count": value, - "statistic_index": idx, - }, - priority=priority, - console_log=True, - ) - error_state = True + is_critical = error_field in self.CRITICAL_ERROR_FIELDS + errors_on_interface.append((error_field, value, is_critical)) + if errors_on_interface: + error_state = True + interface_label = stat.ifname or "unknown" + error_names = [e[0] for e in errors_on_interface] + any_critical = any(e[2] for e in errors_on_interface) + priority = EventPriority.CRITICAL if any_critical else EventPriority.ERROR + errors_data = {field: value for field, value, _ in errors_on_interface} + self._log_event( + category=EventCategory.IO, + description=f"RDMA error detected on {interface_label}: [{', '.join(error_names)}]", + data={ + "interface": stat.ifname, + "port": stat.port, + "errors": errors_data, + "statistic_index": idx, + }, + priority=priority, + console_log=True, + ) if error_state: self.result.message = "RDMA errors detected in 
statistics" diff --git a/test/unit/plugin/test_rdma_analyzer.py b/test/unit/plugin/test_rdma_analyzer.py index c64cab08..196d4c5d 100644 --- a/test/unit/plugin/test_rdma_analyzer.py +++ b/test/unit/plugin/test_rdma_analyzer.py @@ -80,14 +80,14 @@ def test_single_error_detected(rdma_analyzer, clean_rdma_model): assert result.status == ExecutionStatus.ERROR assert "RDMA errors detected in statistics" in result.message assert len(result.events) == 1 - assert result.events[0].description == "RDMA error detected: tx_roce_errors" + assert result.events[0].description == "RDMA error detected on bnxt_re0: [tx_roce_errors]" assert result.events[0].priority == EventPriority.ERROR - assert result.events[0].data["error_count"] == 5 + assert result.events[0].data["errors"] == {"tx_roce_errors": 5} assert result.events[0].data["interface"] == "bnxt_re0" def test_multiple_errors_detected(rdma_analyzer, clean_rdma_model): - """Test with data containing multiple errors.""" + """Test with data containing multiple errors (grouped per interface).""" stats = list(clean_rdma_model.statistic_list) stats[0].tx_roce_errors = 10 stats[0].rx_roce_errors = 3 @@ -96,13 +96,15 @@ def test_multiple_errors_detected(rdma_analyzer, clean_rdma_model): result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.ERROR assert "RDMA errors detected in statistics" in result.message - assert len(result.events) == 3 + assert len(result.events) == 2 # one per interface for event in result.events: assert event.priority == EventPriority.ERROR + # Total 3 errors across 2 interfaces + assert sum(len(e.data["errors"]) for e in result.events) == 3 def test_critical_error_detected(rdma_analyzer, clean_rdma_model): - """Test with data containing a critical error.""" + """Test with data containing a critical error (grouped per interface).""" stats = list(clean_rdma_model.statistic_list) stats[0].unrecoverable_err = 1 stats[0].res_tx_pci_err = 2 @@ -110,9 +112,10 @@ def 
test_critical_error_detected(rdma_analyzer, clean_rdma_model): result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.ERROR assert "RDMA errors detected in statistics" in result.message - assert len(result.events) == 2 - critical_events = [e for e in result.events if e.priority == EventPriority.CRITICAL] - assert len(critical_events) == 2 + assert len(result.events) == 1 # one event per interface + assert result.events[0].priority == EventPriority.CRITICAL + assert "unrecoverable_err" in result.events[0].data["errors"] + assert "res_tx_pci_err" in result.events[0].data["errors"] def test_empty_statistics(rdma_analyzer): @@ -138,7 +141,7 @@ def test_multiple_interfaces_with_errors(rdma_analyzer, clean_rdma_model): def test_all_error_types(rdma_analyzer): - """Test that all error fields are properly detected.""" + """Test that all error fields are properly detected (grouped in one event).""" stats = RdmaStatistics( ifname="bnxt_re_test", port=1, @@ -149,10 +152,14 @@ def test_all_error_types(rdma_analyzer): model = RdmaDataModel(statistic_list=[stats]) result = rdma_analyzer.analyze_data(model) assert result.status == ExecutionStatus.ERROR - assert len(result.events) == 3 - critical_events = [e for e in result.events if e.data["error_field"] == "unrecoverable_err"] - assert len(critical_events) == 1 - assert critical_events[0].priority == EventPriority.CRITICAL + assert len(result.events) == 1 # one event per interface + assert "unrecoverable_err" in result.events[0].data["errors"] + assert result.events[0].priority == EventPriority.CRITICAL + assert set(result.events[0].data["errors"].keys()) == { + "recoverable_errors", + "tx_roce_errors", + "unrecoverable_err", + } def test_zero_errors_are_ignored(rdma_analyzer): From 1ab378307ccf0f91feb9e8b014d28463f92806cb Mon Sep 17 00:00:00 2001 From: jaspals Date: Mon, 23 Feb 2026 12:46:07 -0600 Subject: [PATCH 4/6] tests fix --- .../fixtures/rdma_plugin_config.json | 10 +- 
test/functional/test_plugin_configs.py | 1 + test/functional/test_rdma_plugin.py | 106 ++++++++++++++++++ .../test_reference_config_workflow.py | 1 + test/functional/test_run_plugins.py | 1 + 5 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 test/functional/test_rdma_plugin.py diff --git a/test/functional/fixtures/rdma_plugin_config.json b/test/functional/fixtures/rdma_plugin_config.json index 3ddd4207..f62214b3 100644 --- a/test/functional/fixtures/rdma_plugin_config.json +++ b/test/functional/fixtures/rdma_plugin_config.json @@ -1 +1,9 @@ -{"global_args":{},"plugins":{"RdmaPlugin":{}},"result_collators":{},"name":"RdmaPlugin config","desc":"Config for testing RdmaPlugin"} +{ + "global_args": {}, + "plugins": { + "RdmaPlugin": {} + }, + "result_collators": {}, + "name": "RdmaPlugin config", + "desc": "Config for testing RdmaPlugin" + } diff --git a/test/functional/test_plugin_configs.py b/test/functional/test_plugin_configs.py index c5e93bf7..a0d73aaa 100644 --- a/test/functional/test_plugin_configs.py +++ b/test/functional/test_plugin_configs.py @@ -118,6 +118,7 @@ def test_plugin_config_with_builtin_config(run_cli_command, tmp_path): "OsPlugin", "PackagePlugin", "ProcessPlugin", + "RdmaPlugin", "RocmPlugin", "StoragePlugin", "SysctlPlugin", diff --git a/test/functional/test_rdma_plugin.py b/test/functional/test_rdma_plugin.py new file mode 100644 index 00000000..fdac7ade --- /dev/null +++ b/test/functional/test_rdma_plugin.py @@ -0,0 +1,106 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +"""Functional tests for RdmaPlugin with --plugin-configs.""" + +from pathlib import Path + +import pytest + + +@pytest.fixture +def fixtures_dir(): + """Return path to fixtures directory.""" + return Path(__file__).parent / "fixtures" + + +@pytest.fixture +def rdma_config_file(fixtures_dir): + """Return path to RdmaPlugin config file.""" + return fixtures_dir / "rdma_plugin_config.json" + + +def test_rdma_plugin_with_basic_config(run_cli_command, rdma_config_file, tmp_path): + """Test RdmaPlugin using basic config file.""" + assert rdma_config_file.exists(), f"Config file not found: {rdma_config_file}" + + log_path = str(tmp_path / "logs_rdma_basic") + result = run_cli_command( + ["--log-path", log_path, "--plugin-configs", str(rdma_config_file)], check=False + ) + + assert result.returncode in [0, 1, 2] + output = result.stdout + result.stderr + assert len(output) > 0 + assert "rdmaplugin" in output.lower() or "rdma" in output.lower() + + +def test_rdma_plugin_with_run_plugins_subcommand(run_cli_command, tmp_path): + """Test RdmaPlugin using run-plugins subcommand.""" + log_path = str(tmp_path / "logs_rdma_subcommand") + result = run_cli_command(["--log-path", log_path, "run-plugins", "RdmaPlugin"], check=False) + + assert result.returncode in [0, 1, 2] + output = result.stdout + result.stderr + assert len(output) > 0 + + +def test_rdma_plugin_with_passive_interaction(run_cli_command, rdma_config_file, tmp_path): + """Test RdmaPlugin with PASSIVE system interaction level.""" + log_path = str(tmp_path / "logs_rdma_passive") + result = run_cli_command( + [ + "--log-path", + log_path, + "--sys-interaction-level", + "PASSIVE", + "--plugin-configs", + str(rdma_config_file), + ], + check=False, + ) + + assert result.returncode in [0, 1, 2] + output = result.stdout + result.stderr + assert len(output) > 0 + + +def test_rdma_plugin_skip_sudo(run_cli_command, rdma_config_file, tmp_path): + 
"""Test RdmaPlugin with --skip-sudo flag.""" + log_path = str(tmp_path / "logs_rdma_no_sudo") + result = run_cli_command( + [ + "--log-path", + log_path, + "--skip-sudo", + "--plugin-configs", + str(rdma_config_file), + ], + check=False, + ) + + assert result.returncode in [0, 1, 2] + output = result.stdout + result.stderr + assert len(output) > 0 diff --git a/test/functional/test_reference_config_workflow.py b/test/functional/test_reference_config_workflow.py index 44362149..784ae909 100644 --- a/test/functional/test_reference_config_workflow.py +++ b/test/functional/test_reference_config_workflow.py @@ -238,6 +238,7 @@ def test_reference_config_with_analysis_args(run_cli_command, tmp_path): "OsPlugin", "PackagePlugin", "ProcessPlugin", + "RdmaPlugin", "RocmPlugin", "SysctlPlugin", ] diff --git a/test/functional/test_run_plugins.py b/test/functional/test_run_plugins.py index c7f6c662..e819fcbc 100644 --- a/test/functional/test_run_plugins.py +++ b/test/functional/test_run_plugins.py @@ -62,6 +62,7 @@ def test_plugin_registry_has_plugins(all_plugins): "OsPlugin", "PackagePlugin", "ProcessPlugin", + "RdmaPlugin", "RocmPlugin", "StoragePlugin", "SysctlPlugin", From 28a9062da929c6a0d53033c3ef06e1baf64247b3 Mon Sep 17 00:00:00 2001 From: jaspals Date: Wed, 25 Feb 2026 10:49:35 -0600 Subject: [PATCH 5/6] review fixes --- nodescraper/plugins/inband/rdma/__init__.py | 2 +- .../plugins/inband/rdma/rdma_analyzer.py | 2 +- .../plugins/inband/rdma/rdma_collector.py | 36 ++++++++++--------- .../plugins/inband/rdma/rdma_plugin.py | 2 +- nodescraper/plugins/inband/rdma/rdmadata.py | 6 ++-- test/functional/test_rdma_plugin.py | 2 +- test/unit/plugin/test_rdma_analyzer.py | 2 +- test/unit/plugin/test_rdma_collector.py | 2 +- 8 files changed, 27 insertions(+), 27 deletions(-) diff --git a/nodescraper/plugins/inband/rdma/__init__.py b/nodescraper/plugins/inband/rdma/__init__.py index 733dad59..5c7cc181 100644 --- a/nodescraper/plugins/inband/rdma/__init__.py +++ 
b/nodescraper/plugins/inband/rdma/__init__.py @@ -2,7 +2,7 @@ # # MIT License # -# Copyright (c) 2025 Advanced Micro Devices, Inc. +# Copyright (c) 2026 Advanced Micro Devices, Inc. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/nodescraper/plugins/inband/rdma/rdma_analyzer.py b/nodescraper/plugins/inband/rdma/rdma_analyzer.py index 065b716d..d7dd4a27 100644 --- a/nodescraper/plugins/inband/rdma/rdma_analyzer.py +++ b/nodescraper/plugins/inband/rdma/rdma_analyzer.py @@ -2,7 +2,7 @@ # # MIT License # -# Copyright (c) 2025 Advanced Micro Devices, Inc. +# Copyright (c) 2026 Advanced Micro Devices, Inc. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/nodescraper/plugins/inband/rdma/rdma_collector.py b/nodescraper/plugins/inband/rdma/rdma_collector.py index b3e11ea6..2be1547c 100644 --- a/nodescraper/plugins/inband/rdma/rdma_collector.py +++ b/nodescraper/plugins/inband/rdma/rdma_collector.py @@ -2,7 +2,7 @@ # # MIT License # -# Copyright (c) 2025 Advanced Micro Devices, Inc. +# Copyright (c) 2026 Advanced Micro Devices, Inc. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -42,24 +42,26 @@ class RdmaCollector(InBandDataCollector[RdmaDataModel, None]): DATA_MODEL = RdmaDataModel SUPPORTED_OS_FAMILY = {OSFamily.LINUX} + CMD_LINK = "rdma link -j" + CMD_STATISTIC = "rdma statistic -j" + def _run_rdma_command(self, cmd: str) -> Optional[list[dict]]: """Run rdma command with JSON output. Args: - cmd: Subcommand (e.g. 'link' or 'statistic'), without 'rdma' prefix. + cmd: Full command string (e.g. CMD_LINK or CMD_STATISTIC). Returns: List of dicts from JSON output, or None on failure. 
""" - full_cmd = f"rdma {cmd} -j" - res = self._run_sut_cmd(full_cmd) + res = self._run_sut_cmd(cmd) if res.exit_code != 0: self._log_event( - category=EventCategory.APPLICATION, - description=f"Error running rdma command: {full_cmd}", + category=EventCategory.NETWORK, + description=f"Error running rdma command: {cmd}", data={ - "command": full_cmd, + "command": cmd, "exit_code": res.exit_code, "stderr": res.stderr, }, @@ -75,10 +77,10 @@ def _run_rdma_command(self, cmd: str) -> Optional[list[dict]]: return json.loads(res.stdout) except json.JSONDecodeError as e: self._log_event( - category=EventCategory.APPLICATION, - description=f"Error parsing command: {full_cmd} json data", + category=EventCategory.NETWORK, + description=f"Error parsing command: {cmd} json data", data={ - "cmd": full_cmd, + "cmd": cmd, "exception": get_exception_traceback(e), }, priority=EventPriority.ERROR, @@ -88,7 +90,7 @@ def _run_rdma_command(self, cmd: str) -> Optional[list[dict]]: def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: """Get RDMA statistics from 'rdma statistic -j'.""" - stat_data = self._run_rdma_command("statistic") + stat_data = self._run_rdma_command(self.CMD_STATISTIC) if stat_data is None: return None if not stat_data: @@ -99,7 +101,7 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: for stat in stat_data: if not isinstance(stat, dict): self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description="Invalid data type for RDMA statistic", data={"data_type": type(stat).__name__}, priority=EventPriority.WARNING, @@ -108,7 +110,7 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: statistics.append(RdmaStatistics(**stat)) except ValidationError as e: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description="Failed to build RdmaStatistics model", data={"exception": get_exception_traceback(e)}, priority=EventPriority.WARNING, @@ -117,7 +119,7 
@@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]: def _get_rdma_link(self) -> Optional[list[RdmaLink]]: """Get RDMA link data from 'rdma link -j'.""" - link_data = self._run_rdma_command("link") + link_data = self._run_rdma_command(self.CMD_LINK) if link_data is None: return None if not link_data: @@ -128,7 +130,7 @@ def _get_rdma_link(self) -> Optional[list[RdmaLink]]: for link in link_data: if not isinstance(link, dict): self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description="Invalid data type for RDMA link", data={"data_type": type(link).__name__}, priority=EventPriority.WARNING, @@ -138,7 +140,7 @@ def _get_rdma_link(self) -> Optional[list[RdmaLink]]: return links except ValidationError as e: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description="Failed to build RdmaLink model", data={"exception": get_exception_traceback(e)}, priority=EventPriority.WARNING, @@ -173,7 +175,7 @@ def collect_data(self, args: None = None) -> tuple[TaskResult, Optional[RdmaData except Exception as e: self._log_event( - category=EventCategory.APPLICATION, + category=EventCategory.NETWORK, description="Error running RDMA collector", data={"exception": get_exception_traceback(e)}, priority=EventPriority.ERROR, diff --git a/nodescraper/plugins/inband/rdma/rdma_plugin.py b/nodescraper/plugins/inband/rdma/rdma_plugin.py index ec3c0249..fac85862 100644 --- a/nodescraper/plugins/inband/rdma/rdma_plugin.py +++ b/nodescraper/plugins/inband/rdma/rdma_plugin.py @@ -2,7 +2,7 @@ # # MIT License # -# Copyright (c) 2025 Advanced Micro Devices, Inc. +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/nodescraper/plugins/inband/rdma/rdmadata.py b/nodescraper/plugins/inband/rdma/rdmadata.py index e8354b82..dc6b79fe 100644 --- a/nodescraper/plugins/inband/rdma/rdmadata.py +++ b/nodescraper/plugins/inband/rdma/rdmadata.py @@ -2,7 +2,7 @@ # # MIT License # -# Copyright (c) 2025 Advanced Micro Devices, Inc. +# Copyright (c) 2026 Advanced Micro Devices, Inc. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -25,7 +25,7 @@ ############################################################################### from typing import Optional -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import BaseModel, Field, model_validator from typing_extensions import Self from nodescraper.models import DataModel @@ -34,8 +34,6 @@ class RdmaStatistics(BaseModel): """RDMA statistic entry from 'rdma statistic -j'.""" - model_config = ConfigDict(extra="allow") - ifname: Optional[str] = None port: Optional[int] = None diff --git a/test/functional/test_rdma_plugin.py b/test/functional/test_rdma_plugin.py index fdac7ade..862de3b8 100644 --- a/test/functional/test_rdma_plugin.py +++ b/test/functional/test_rdma_plugin.py @@ -2,7 +2,7 @@ # # MIT License # -# Copyright (c) 2025 Advanced Micro Devices, Inc. +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/test/unit/plugin/test_rdma_analyzer.py b/test/unit/plugin/test_rdma_analyzer.py index 196d4c5d..c7b1dfd8 100644 --- a/test/unit/plugin/test_rdma_analyzer.py +++ b/test/unit/plugin/test_rdma_analyzer.py @@ -2,7 +2,7 @@ # # MIT License # -# Copyright (c) 2025 Advanced Micro Devices, Inc. +# Copyright (c) 2026 Advanced Micro Devices, Inc. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/test/unit/plugin/test_rdma_collector.py b/test/unit/plugin/test_rdma_collector.py index a2508497..0343a588 100644 --- a/test/unit/plugin/test_rdma_collector.py +++ b/test/unit/plugin/test_rdma_collector.py @@ -2,7 +2,7 @@ # # MIT License # -# Copyright (c) 2025 Advanced Micro Devices, Inc. +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal From 79deddd290384192384f3d78dbc3e0607f4ee1a5 Mon Sep 17 00:00:00 2001 From: jaspals Date: Wed, 25 Feb 2026 13:42:50 -0600 Subject: [PATCH 6/6] data fix --- nodescraper/plugins/inband/rdma/rdmadata.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nodescraper/plugins/inband/rdma/rdmadata.py b/nodescraper/plugins/inband/rdma/rdmadata.py index dc6b79fe..7b1c1a4a 100644 --- a/nodescraper/plugins/inband/rdma/rdmadata.py +++ b/nodescraper/plugins/inband/rdma/rdmadata.py @@ -25,7 +25,7 @@ ############################################################################### from typing import Optional -from pydantic import BaseModel, Field, model_validator +from pydantic import BaseModel, ConfigDict, Field, model_validator from typing_extensions import Self from nodescraper.models import DataModel @@ -34,6 +34,8 @@ class RdmaStatistics(BaseModel): """RDMA statistic entry from 'rdma statistic -j'.""" + model_config = ConfigDict(extra="allow") + ifname: Optional[str] = None port: Optional[int] = None