diff --git a/nodescraper/plugins/inband/rdma/__init__.py b/nodescraper/plugins/inband/rdma/__init__.py new file mode 100644 index 00000000..5c7cc181 --- /dev/null +++ b/nodescraper/plugins/inband/rdma/__init__.py @@ -0,0 +1,28 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from .rdma_plugin import RdmaPlugin + +__all__ = ["RdmaPlugin"] diff --git a/nodescraper/plugins/inband/rdma/rdma_analyzer.py b/nodescraper/plugins/inband/rdma/rdma_analyzer.py new file mode 100644 index 00000000..d7dd4a27 --- /dev/null +++ b/nodescraper/plugins/inband/rdma/rdma_analyzer.py @@ -0,0 +1,186 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
from typing import Optional

from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus
from nodescraper.interfaces import DataAnalyzer
from nodescraper.models import TaskResult

from .rdmadata import RdmaDataModel


class RdmaAnalyzer(DataAnalyzer[RdmaDataModel, None]):
    """Check RDMA statistics for errors (RoCE and other RDMA error counters)."""

    DATA_MODEL = RdmaDataModel

    # Error fields checked from rdma statistic output (bnxt_re, mlx5, ionic, etc.)
    ERROR_FIELDS = [
        "recoverable_errors",
        "tx_roce_errors",
        "tx_roce_discards",
        "rx_roce_errors",
        "rx_roce_discards",
        "local_ack_timeout_err",
        "packet_seq_err",
        "max_retry_exceeded",
        "rnr_nak_retry_err",
        "implied_nak_seq_err",
        "unrecoverable_err",
        "bad_resp_err",
        "local_qp_op_err",
        "local_protection_err",
        "mem_mgmt_op_err",
        "req_remote_invalid_request",
        "req_remote_access_errors",
        "remote_op_err",
        "duplicate_request",
        "res_exceed_max",
        "resp_local_length_error",
        "res_exceeds_wqe",
        "res_opcode_err",
        "res_rx_invalid_rkey",
        "res_rx_domain_err",
        "res_rx_no_perm",
        "res_rx_range_err",
        "res_tx_invalid_rkey",
        "res_tx_domain_err",
        "res_tx_no_perm",
        "res_tx_range_err",
        "res_irrq_oflow",
        "res_unsup_opcode",
        "res_unaligned_atomic",
        "res_rem_inv_err",
        "res_mem_err",
        "res_srq_err",
        "res_cmp_err",
        "res_invalid_dup_rkey",
        "res_wqe_format_err",
        "res_cq_load_err",
        "res_srq_load_err",
        "res_tx_pci_err",
        "res_rx_pci_err",
        "out_of_buffer",
        "out_of_sequence",
        "req_cqe_error",
        "req_cqe_flush_error",
        "resp_cqe_error",
        "resp_cqe_flush_error",
        "resp_remote_access_errors",
        "req_rx_pkt_seq_err",
        "req_rx_rnr_retry_err",
        "req_rx_rmt_acc_err",
        "req_rx_rmt_req_err",
        "req_rx_oper_err",
        "req_rx_impl_nak_seq_err",
        "req_rx_cqe_err",
        "req_rx_cqe_flush",
        "req_rx_dup_response",
        "req_rx_inval_pkts",
        "req_tx_loc_acc_err",
        "req_tx_loc_oper_err",
        "req_tx_mem_mgmt_err",
        "req_tx_retry_excd_err",
        "req_tx_loc_sgl_inv_err",
        "resp_rx_dup_request",
        "resp_rx_outof_buf",
        "resp_rx_outouf_seq",
        "resp_rx_cqe_err",
        "resp_rx_cqe_flush",
        "resp_rx_loc_len_err",
        "resp_rx_inval_request",
        "resp_rx_loc_oper_err",
        "resp_rx_outof_atomic",
        "resp_tx_pkt_seq_err",
        "resp_tx_rmt_inval_req_err",
        "resp_tx_rmt_acc_err",
        "resp_tx_rmt_oper_err",
        "resp_tx_rnr_retry_err",
        "resp_tx_loc_sgl_inv_err",
        "resp_rx_s0_table_err",
        "resp_rx_ccl_cts_outouf_seq",
        "tx_rdma_ack_timeout",
        "tx_rdma_ccl_cts_ack_timeout",
        "rx_rdma_mtu_discard_pkts",
    ]

    # Subset of ERROR_FIELDS whose non-zero value raises the event priority to CRITICAL.
    CRITICAL_ERROR_FIELDS = [
        "unrecoverable_err",
        "res_tx_pci_err",
        "res_rx_pci_err",
        "res_mem_err",
    ]

    def _nonzero_error_counters(self, stat) -> dict:
        """Return the ERROR_FIELDS counters of one statistic entry that are > 0.

        Args:
            stat: A single entry of ``statistic_list``; counters are read with
                ``getattr`` so absent fields are simply skipped.

        Returns:
            Mapping of error field name to its value, in ERROR_FIELDS order.
        """
        found = {}
        for error_field in self.ERROR_FIELDS:
            value = getattr(stat, error_field, None)
            # Counter values are not validated by the data model, so a
            # non-numeric value must not reach the ``>`` comparison (it would
            # raise TypeError and abort the whole analysis).
            if isinstance(value, (int, float)) and value > 0:
                found[error_field] = value
        return found

    def analyze_data(self, data: RdmaDataModel, args: Optional[None] = None) -> TaskResult:
        """Analyze RDMA statistics for non-zero error counters.

        One event is logged per statistic entry that has at least one non-zero
        error counter. The event priority is CRITICAL when any of the counters
        is listed in CRITICAL_ERROR_FIELDS, otherwise ERROR.

        Args:
            data: RDMA data model with statistic_list (and optionally link_list).
            args: Unused (analyzer has no configurable args).

        Returns:
            TaskResult with status NOT_RAN if the statistics list is empty,
            OK if no error counter is set, ERROR if any error counter > 0.
        """
        if not data.statistic_list:
            self.result.message = "RDMA statistics list is empty"
            self.result.status = ExecutionStatus.NOT_RAN
            return self.result

        error_state = False
        for idx, stat in enumerate(data.statistic_list):
            errors_data = self._nonzero_error_counters(stat)
            if not errors_data:
                continue

            error_state = True
            interface_label = stat.ifname or "unknown"
            error_names = list(errors_data)
            any_critical = any(name in self.CRITICAL_ERROR_FIELDS for name in error_names)
            priority = EventPriority.CRITICAL if any_critical else EventPriority.ERROR
            self._log_event(
                category=EventCategory.IO,
                description=f"RDMA error detected on {interface_label}: [{', '.join(error_names)}]",
                data={
                    "interface": stat.ifname,
                    "port": stat.port,
                    "errors": errors_data,
                    "statistic_index": idx,
                },
                priority=priority,
                console_log=True,
            )

        if error_state:
            self.result.message = "RDMA errors detected in statistics"
            self.result.status = ExecutionStatus.ERROR
        else:
            self.result.message = "No RDMA errors detected in statistics"
            self.result.status = ExecutionStatus.OK
        return self.result
import json
from typing import Optional

from pydantic import ValidationError

from nodescraper.base import InBandDataCollector
from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily
from nodescraper.models import TaskResult
from nodescraper.utils import get_exception_traceback

from .rdmadata import RdmaDataModel, RdmaLink, RdmaStatistics


class RdmaCollector(InBandDataCollector[RdmaDataModel, None]):
    """Collect RDMA status and statistics via rdma link and rdma statistic commands."""

    DATA_MODEL = RdmaDataModel
    SUPPORTED_OS_FAMILY = {OSFamily.LINUX}

    CMD_LINK = "rdma link -j"
    CMD_STATISTIC = "rdma statistic -j"

    def _run_rdma_command(self, cmd: str) -> Optional[list[dict]]:
        """Run rdma command with JSON output.

        Args:
            cmd: Full command string (e.g. CMD_LINK or CMD_STATISTIC).

        Returns:
            List parsed from the JSON output (empty when the command printed
            nothing), or None on failure: non-zero exit code, unparsable JSON,
            or a JSON document whose top level is not a list.
        """
        res = self._run_sut_cmd(cmd)

        if res.exit_code != 0:
            self._log_event(
                category=EventCategory.NETWORK,
                description=f"Error running rdma command: {cmd}",
                data={
                    "command": cmd,
                    "exit_code": res.exit_code,
                    "stderr": res.stderr,
                },
                priority=EventPriority.ERROR,
                console_log=True,
            )
            return None

        if not res.stdout.strip():
            return []

        try:
            parsed = json.loads(res.stdout)
        except json.JSONDecodeError as e:
            self._log_event(
                category=EventCategory.NETWORK,
                description=f"Error parsing command: {cmd} json data",
                data={
                    "cmd": cmd,
                    "exception": get_exception_traceback(e),
                },
                priority=EventPriority.ERROR,
                console_log=True,
            )
            return None

        # Callers iterate the result as a list of entries; iterating e.g. a
        # dict would walk its keys, so reject anything that is not a list.
        if not isinstance(parsed, list):
            self._log_event(
                category=EventCategory.NETWORK,
                description=f"Unexpected json structure from command: {cmd}",
                data={
                    "command": cmd,
                    "data_type": type(parsed).__name__,
                },
                priority=EventPriority.ERROR,
                console_log=True,
            )
            return None
        return parsed

    def _build_models(self, entries: list, model_cls: type, label: str) -> list:
        """Convert parsed JSON entries into ``model_cls`` instances.

        Each entry is validated on its own, so an invalid entry is logged and
        skipped without discarding the entries that follow it.

        Args:
            entries: Parsed JSON list returned by ``_run_rdma_command``.
            model_cls: Model class to instantiate (RdmaStatistics or RdmaLink).
            label: Short name used in the "invalid data type" event text.

        Returns:
            List of successfully built models, in input order.
        """
        models = []
        for entry in entries:
            if not isinstance(entry, dict):
                self._log_event(
                    category=EventCategory.NETWORK,
                    description=f"Invalid data type for RDMA {label}",
                    data={"data_type": type(entry).__name__},
                    priority=EventPriority.WARNING,
                )
                continue
            try:
                models.append(model_cls(**entry))
            except ValidationError as e:
                self._log_event(
                    category=EventCategory.NETWORK,
                    description=f"Failed to build {model_cls.__name__} model",
                    data={"exception": get_exception_traceback(e)},
                    priority=EventPriority.WARNING,
                )
        return models

    def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]:
        """Get RDMA statistics from 'rdma statistic -j'.

        Returns:
            List of RdmaStatistics (possibly empty), or None if the command failed.
        """
        stat_data = self._run_rdma_command(self.CMD_STATISTIC)
        if stat_data is None:
            return None
        return self._build_models(stat_data, RdmaStatistics, "statistic")

    def _get_rdma_link(self) -> Optional[list[RdmaLink]]:
        """Get RDMA link data from 'rdma link -j'.

        Returns:
            List of RdmaLink (possibly empty), or None if the command failed.
        """
        link_data = self._run_rdma_command(self.CMD_LINK)
        if link_data is None:
            return None
        return self._build_models(link_data, RdmaLink, "link")

    def collect_data(self, args: None = None) -> tuple[TaskResult, Optional[RdmaDataModel]]:
        """Collect RDMA statistics and link data.

        Args:
            args: Unused (collector has no configurable args).

        Returns:
            Task result and RdmaDataModel, or None if both commands failed
            or an unexpected exception occurred.
        """
        try:
            links = self._get_rdma_link()
            statistics = self._get_rdma_statistics()

            if statistics is None and links is None:
                self.result.status = ExecutionStatus.EXECUTION_FAILURE
                self.result.message = "Failed to collect RDMA data"
                return self.result, None

            # A single failed command is tolerated: its list is left empty.
            rdma_data = RdmaDataModel(
                statistic_list=statistics if statistics is not None else [],
                link_list=links if links is not None else [],
            )
            self.result.message = (
                f"Collected {len(rdma_data.statistic_list)} RDMA statistics, "
                f"{len(rdma_data.link_list)} RDMA links"
            )
            self.result.status = ExecutionStatus.OK
            return self.result, rdma_data

        except Exception as e:
            self._log_event(
                category=EventCategory.NETWORK,
                description="Error running RDMA collector",
                data={"exception": get_exception_traceback(e)},
                priority=EventPriority.ERROR,
                console_log=True,
            )
            self.result.status = ExecutionStatus.EXECUTION_FAILURE
            self.result.message = "Error running RDMA collector"
            return self.result, None
from nodescraper.base import InBandDataPlugin

from .rdma_analyzer import RdmaAnalyzer
from .rdma_collector import RdmaCollector
from .rdmadata import RdmaDataModel


class RdmaPlugin(InBandDataPlugin[RdmaDataModel, None, None]):
    """In-band plugin that collects and analyzes RDMA statistics and link data.

    The class only wires together the data model, collector and analyzer;
    both generic argument slots are ``None`` because neither the collector
    nor the analyzer declares configurable arguments.
    """

    # Model shared by the collector (producer) and the analyzer (consumer).
    DATA_MODEL = RdmaDataModel
    # Collector class used for this plugin.
    COLLECTOR = RdmaCollector
    # Analyzer class used for this plugin.
    ANALYZER = RdmaAnalyzer
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self

from nodescraper.models import DataModel


class RdmaStatistics(BaseModel):
    """One entry of the 'rdma statistic -j' output.

    Only ``ifname`` and ``port`` are declared; the model is configured with
    ``extra="allow"`` so the remaining keys of an entry are accepted as well.
    """

    model_config = ConfigDict(extra="allow")

    ifname: Optional[str] = None
    port: Optional[int] = None

    @model_validator(mode="after")
    def validate_at_least_one_field(self) -> Self:
        """Reject an entry for which no field was set at all."""
        if self.model_fields_set:
            return self
        raise ValueError("At least one field must be set in RdmaStatistics")


class RdmaLink(BaseModel):
    """One entry of the 'rdma link -j' output."""

    ifindex: Optional[int] = None
    ifname: Optional[str] = None
    port: Optional[int] = None
    state: Optional[str] = None
    physical_state: Optional[str] = None
    netdev: Optional[str] = None
    netdev_index: Optional[int] = None

    @model_validator(mode="after")
    def validate_at_least_one_field(self) -> Self:
        """Reject an entry for which no field was set at all."""
        if self.model_fields_set:
            return self
        raise ValueError("At least one field must be set in RdmaLink")


class RdmaDataModel(DataModel):
    """
    Container for collected RDMA (Remote Direct Memory Access) data.

    Attributes:
        link_list: RDMA links parsed from 'rdma link -j'.
        statistic_list: RDMA statistics parsed from 'rdma statistic -j'.

    Both lists default to empty.
    """

    link_list: list[RdmaLink] = Field(default_factory=list)
    statistic_list: list[RdmaStatistics] = Field(default_factory=list)
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
"""Functional tests for RdmaPlugin with --plugin-configs."""

from pathlib import Path

import pytest


@pytest.fixture
def fixtures_dir():
    """Directory holding the functional-test fixture files."""
    return Path(__file__).parent.joinpath("fixtures")


@pytest.fixture
def rdma_config_file(fixtures_dir):
    """Location of the RdmaPlugin plugin-config fixture."""
    return fixtures_dir.joinpath("rdma_plugin_config.json")


def test_rdma_plugin_with_basic_config(run_cli_command, rdma_config_file, tmp_path):
    """Run the CLI with the basic RdmaPlugin config and expect RDMA output."""
    assert rdma_config_file.exists(), f"Config file not found: {rdma_config_file}"

    log_path = str(tmp_path / "logs_rdma_basic")
    cli_args = ["--log-path", log_path, "--plugin-configs", str(rdma_config_file)]
    result = run_cli_command(cli_args, check=False)

    assert result.returncode in (0, 1, 2)
    output = result.stdout + result.stderr
    assert output
    # "rdmaplugin" contains "rdma", so a single substring check covers both.
    assert "rdma" in output.lower()


def test_rdma_plugin_with_run_plugins_subcommand(run_cli_command, tmp_path):
    """Run RdmaPlugin through the run-plugins subcommand."""
    log_path = str(tmp_path / "logs_rdma_subcommand")
    cli_args = ["--log-path", log_path, "run-plugins", "RdmaPlugin"]
    result = run_cli_command(cli_args, check=False)

    assert result.returncode in (0, 1, 2)
    output = result.stdout + result.stderr
    assert output


def test_rdma_plugin_with_passive_interaction(run_cli_command, rdma_config_file, tmp_path):
    """Run RdmaPlugin with the PASSIVE system interaction level."""
    log_path = str(tmp_path / "logs_rdma_passive")
    cli_args = [
        "--log-path",
        log_path,
        "--sys-interaction-level",
        "PASSIVE",
        "--plugin-configs",
        str(rdma_config_file),
    ]
    result = run_cli_command(cli_args, check=False)

    assert result.returncode in (0, 1, 2)
    output = result.stdout + result.stderr
    assert output


def test_rdma_plugin_skip_sudo(run_cli_command, rdma_config_file, tmp_path):
    """Run RdmaPlugin with the --skip-sudo flag."""
    log_path = str(tmp_path / "logs_rdma_no_sudo")
    cli_args = [
        "--log-path",
        log_path,
        "--skip-sudo",
        "--plugin-configs",
        str(rdma_config_file),
    ]
    result = run_cli_command(cli_args, check=False)

    assert result.returncode in (0, 1, 2)
    output = result.stdout + result.stderr
    assert output
"physical_state": "LINK_UP", + "netdev": "benic6p1", + "netdev_index": 9 + } +] diff --git a/test/unit/plugin/fixtures/rdma_statistic_example_data.json b/test/unit/plugin/fixtures/rdma_statistic_example_data.json new file mode 100644 index 00000000..e338e41a --- /dev/null +++ b/test/unit/plugin/fixtures/rdma_statistic_example_data.json @@ -0,0 +1,826 @@ +[ + { + "ifname": "bnxt_re0", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + "active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 12, + "watermark_ahs": 8, + "watermark_qps": 229, + "watermark_rc_qps": 220, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 94, + "watermark_mrs": 305, + "watermark_mws": 0, + "rx_pkts": 3504998440, + "rx_bytes": 2966950848, + "tx_pkts": 2747190987, + "tx_bytes": 912073550, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + "res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + "res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + "res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + 
"res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 3324056122, + "tx_read_resp": 3324056122, + "tx_write_req": 622240024, + "tx_send_req": 97500, + "rx_atomic_requests": 0, + "rx_read_requests": 3324056122, + "rx_read_resp": 3324056122, + "rx_write_requests": 626374468, + "rx_send_req": 97500, + "rx_good_pkts": 1401322762, + "rx_good_bytes": 2966950848, + "out_of_buffer": 0, + "np_cnp_sent": 2873487760, + "rp_cnp_handled": 2103675678, + "np_ecn_marked_roce_packets": 2873487760, + "out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + "roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + }, + { + "ifname": "bnxt_re1", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + "active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 14, + "watermark_ahs": 3, + "watermark_qps": 228, + "watermark_rc_qps": 219, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 94, + "watermark_mrs": 287, + "watermark_mws": 0, + "rx_pkts": 1509751895, + "rx_bytes": 3099873130, + "tx_pkts": 692925073, + "tx_bytes": 2068663286, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + 
"res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + "res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + "res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + "res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 3322387232, + "tx_read_resp": 3322387232, + "tx_write_req": 620621144, + "tx_send_req": 0, + "rx_atomic_requests": 0, + "rx_read_requests": 3322387232, + "rx_read_resp": 3322387232, + "rx_write_requests": 621181433, + "rx_send_req": 0, + "rx_good_pkts": 3507768689, + "rx_good_bytes": 3099873130, + "out_of_buffer": 0, + "np_cnp_sent": 1097578610, + "rp_cnp_handled": 2296950502, + "np_ecn_marked_roce_packets": 1097578610, + "out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + "roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + }, + { + "ifname": "bnxt_re2", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + "active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 13, + "watermark_ahs": 4, + "watermark_qps": 230, + "watermark_rc_qps": 221, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 95, + "watermark_mrs": 294, + "watermark_mws": 0, + "rx_pkts": 2328181128, 
+ "rx_bytes": 79750872, + "tx_pkts": 1404869338, + "tx_bytes": 644434628, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + "res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + "res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + "res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + "res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 3212760135, + "tx_read_resp": 3212760135, + "tx_write_req": 1995861174, + "tx_send_req": 0, + "rx_atomic_requests": 0, + "rx_read_requests": 3212760135, + "rx_read_resp": 3212760135, + "rx_write_requests": 1995579948, + "rx_send_req": 0, + "rx_good_pkts": 4025638368, + "rx_good_bytes": 79750872, + "out_of_buffer": 0, + "np_cnp_sent": 4174752904, + "rp_cnp_handled": 2597510056, + "np_ecn_marked_roce_packets": 4174752904, + "out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + 
"roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + }, + { + "ifname": "bnxt_re3", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + "active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 12, + "watermark_ahs": 7, + "watermark_qps": 229, + "watermark_rc_qps": 220, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 95, + "watermark_mrs": 292, + "watermark_mws": 0, + "rx_pkts": 3888070733, + "rx_bytes": 3748987850, + "tx_pkts": 2265082996, + "tx_bytes": 3715380316, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + "res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + "res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + "res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + "res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 3103369202, + "tx_read_resp": 3103369202, + "tx_write_req": 3370635080, + "tx_send_req": 0, + "rx_atomic_requests": 0, + "rx_read_requests": 3103369202, + "rx_read_resp": 3103369202, + 
"rx_write_requests": 3368547249, + "rx_send_req": 0, + "rx_good_pkts": 2688805201, + "rx_good_bytes": 3748987850, + "out_of_buffer": 0, + "np_cnp_sent": 134598312, + "rp_cnp_handled": 1199265532, + "np_ecn_marked_roce_packets": 134598312, + "out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + "roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + }, + { + "ifname": "bnxt_re4", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + "active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 12, + "watermark_ahs": 6, + "watermark_qps": 230, + "watermark_rc_qps": 221, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 95, + "watermark_mrs": 302, + "watermark_mws": 0, + "rx_pkts": 986831570, + "rx_bytes": 1185181414, + "tx_pkts": 1975828812, + "tx_bytes": 2763928250, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + "res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + 
"res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + "res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + "res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 2993618119, + "tx_read_resp": 2993618119, + "tx_write_req": 449606302, + "tx_send_req": 37687, + "rx_atomic_requests": 0, + "rx_read_requests": 2993618119, + "rx_read_resp": 2993618119, + "rx_write_requests": 448485514, + "rx_send_req": 37687, + "rx_good_pkts": 2876478595, + "rx_good_bytes": 1185181414, + "out_of_buffer": 0, + "np_cnp_sent": 3525492995, + "rp_cnp_handled": 2405320271, + "np_ecn_marked_roce_packets": 3525492995, + "out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + "roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + }, + { + "ifname": "bnxt_re5", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + "active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 13, + "watermark_ahs": 7, + "watermark_qps": 228, + "watermark_rc_qps": 219, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 94, + "watermark_mrs": 287, + "watermark_mws": 0, + "rx_pkts": 3602164391, + "rx_bytes": 515322372, + "tx_pkts": 3498885620, + "tx_bytes": 3601952844, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 
0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + "res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + "res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + "res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + "res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 2883798845, + "tx_read_resp": 2883798845, + "tx_write_req": 1822414941, + "tx_send_req": 0, + "rx_atomic_requests": 0, + "rx_read_requests": 2883798845, + "rx_read_resp": 2883798845, + "rx_write_requests": 1819507161, + "rx_send_req": 0, + "rx_good_pkts": 1576292710, + "rx_good_bytes": 515322372, + "out_of_buffer": 0, + "np_cnp_sent": 4093842522, + "rp_cnp_handled": 2025871681, + "np_ecn_marked_roce_packets": 4093842522, + "out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + "roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + }, + { + "ifname": "bnxt_re6", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + 
"active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 13, + "watermark_ahs": 7, + "watermark_qps": 230, + "watermark_rc_qps": 221, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 95, + "watermark_mrs": 294, + "watermark_mws": 0, + "rx_pkts": 2577272275, + "rx_bytes": 2249875450, + "tx_pkts": 2452138468, + "tx_bytes": 700557582, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + "res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + "res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + "res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + "res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 2775090592, + "tx_read_resp": 2775090592, + "tx_write_req": 3201764210, + "tx_send_req": 0, + "rx_atomic_requests": 0, + "rx_read_requests": 2775090592, + "rx_read_resp": 2775090592, + "rx_write_requests": 3201655162, + "rx_send_req": 0, + "rx_good_pkts": 1197866395, + "rx_good_bytes": 2249875450, + "out_of_buffer": 0, + "np_cnp_sent": 2401103251, + "rp_cnp_handled": 1379405880, + "np_ecn_marked_roce_packets": 2401103251, + 
"out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + "roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + }, + { + "ifname": "bnxt_re7", + "port": 1, + "active_pds": 1, + "active_ahs": 0, + "active_qps": 1, + "active_rc_qps": 0, + "active_ud_qps": 0, + "active_srqs": 0, + "active_cqs": 1, + "active_mrs": 0, + "active_mws": 0, + "watermark_pds": 13, + "watermark_ahs": 6, + "watermark_qps": 228, + "watermark_rc_qps": 219, + "watermark_ud_qps": 8, + "watermark_srqs": 8, + "watermark_cqs": 94, + "watermark_mrs": 287, + "watermark_mws": 0, + "rx_pkts": 1606921676, + "rx_bytes": 4007942950, + "tx_pkts": 1249198409, + "tx_bytes": 25134278, + "recoverable_errors": 0, + "tx_roce_errors": 0, + "tx_roce_discards": 0, + "rx_roce_errors": 0, + "rx_roce_discards": 0, + "local_ack_timeout_err": 0, + "packet_seq_err": 0, + "max_retry_exceeded": 0, + "rnr_nak_retry_err": 0, + "implied_nak_seq_err": 0, + "unrecoverable_err": 0, + "bad_resp_err": 0, + "local_qp_op_err": 0, + "local_protection_err": 0, + "mem_mgmt_op_err": 0, + "req_remote_invalid_request": 0, + "req_remote_access_errors": 0, + "remote_op_err": 0, + "duplicate_request": 0, + "res_exceed_max": 0, + "resp_local_length_error": 0, + "res_exceeds_wqe": 0, + "res_opcode_err": 0, + "res_rx_invalid_rkey": 0, + "res_rx_domain_err": 0, + "res_rx_no_perm": 0, + "res_rx_range_err": 0, + "res_tx_invalid_rkey": 0, + "res_tx_domain_err": 0, + "res_tx_no_perm": 0, + "res_tx_range_err": 0, + "res_irrq_oflow": 0, + "res_unsup_opcode": 0, + "res_unaligned_atomic": 0, + "res_rem_inv_err": 0, + "res_mem_err": 0, + "res_srq_err": 0, + "res_cmp_err": 0, + "res_invalid_dup_rkey": 0, + 
"res_wqe_format_err": 0, + "res_cq_load_err": 0, + "res_srq_load_err": 0, + "res_tx_pci_err": 0, + "res_rx_pci_err": 0, + "tx_atomic_req": 0, + "tx_read_req": 2665758274, + "tx_read_resp": 2665758274, + "tx_write_req": 284646587, + "tx_send_req": 0, + "rx_atomic_requests": 0, + "rx_read_requests": 2665758274, + "rx_read_resp": 2665758274, + "rx_write_requests": 284542358, + "rx_send_req": 0, + "rx_good_pkts": 253070639, + "rx_good_bytes": 4007942950, + "out_of_buffer": 0, + "np_cnp_sent": 2670842510, + "rp_cnp_handled": 1353851037, + "np_ecn_marked_roce_packets": 2670842510, + "out_of_sequence": 0, + "pacing_reschedule": 0, + "pacing_complete": 0, + "pacing_alerts": 0, + "db_fifo_register": 2147450881, + "req_cqe_error": 0, + "req_cqe_flush_error": 0, + "resp_cqe_error": 0, + "resp_cqe_flush_error": 0, + "resp_remote_access_errors": 0, + "roce_adp_retrans": 0, + "roce_adp_retrans_to": 0, + "roce_slow_restart": 0, + "roce_slow_restart_cnps": 0, + "roce_slow_restart_trans": 0, + "rp_cnp_ignored": 0, + "rx_icrc_encapsulated": 0 + } +] diff --git a/test/unit/plugin/test_rdma_analyzer.py b/test/unit/plugin/test_rdma_analyzer.py new file mode 100644 index 00000000..c7b1dfd8 --- /dev/null +++ b/test/unit/plugin/test_rdma_analyzer.py @@ -0,0 +1,279 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+#
+###############################################################################
+import json
+from pathlib import Path
+
+import pytest
+
+from nodescraper.enums import EventPriority, ExecutionStatus
+from nodescraper.plugins.inband.rdma.rdma_analyzer import RdmaAnalyzer
+from nodescraper.plugins.inband.rdma.rdmadata import (
+    RdmaDataModel,
+    RdmaLink,
+    RdmaStatistics,
+)
+
+
+@pytest.fixture
+def rdma_analyzer(system_info):
+    return RdmaAnalyzer(system_info)
+
+
+@pytest.fixture
+def plugin_fixtures_path():
+    return Path(__file__).parent / "fixtures"
+
+
+@pytest.fixture
+def clean_stats(plugin_fixtures_path):
+    """List of clean RdmaStatistics (no errors) for building models with links."""
+    path = plugin_fixtures_path / "rdma_statistic_example_data.json"
+    data = json.loads(path.read_text(encoding="utf-8"))  # explicit: not locale-dependent
+    return [RdmaStatistics(**s) for s in data]
+
+
+@pytest.fixture
+def clean_rdma_model(clean_stats):
+    """RDMA data model built from the clean example statistics (no link_list given).
+
+    Reuses clean_stats so the example JSON is loaded in exactly one place.
+    """
+    return RdmaDataModel(statistic_list=clean_stats)
+
+
+def test_no_errors_detected(rdma_analyzer, clean_rdma_model):
+    """Test with nominal data that has no errors."""
+    result = rdma_analyzer.analyze_data(clean_rdma_model)
+    assert result.status == ExecutionStatus.OK
+    assert len(result.events) == 0
+
+
+def test_single_error_detected(rdma_analyzer, clean_rdma_model):
+    """Test with data containing a single error."""
+    stats = list(clean_rdma_model.statistic_list)
+    stats[0].tx_roce_errors = 5
+    model = RdmaDataModel(statistic_list=stats)
+    result = rdma_analyzer.analyze_data(model)
+    assert result.status == ExecutionStatus.ERROR
+    assert "RDMA errors detected in statistics" in result.message
+    assert len(result.events) == 1
+    assert result.events[0].description == "RDMA error detected on bnxt_re0: [tx_roce_errors]"
+    assert result.events[0].priority == EventPriority.ERROR
+    assert result.events[0].data["errors"] == {"tx_roce_errors": 5}
+    assert result.events[0].data["interface"] == "bnxt_re0"
+
+
+def test_multiple_errors_detected(rdma_analyzer, clean_rdma_model):
+    """Test with data containing multiple errors (grouped per interface)."""
+    stats = list(clean_rdma_model.statistic_list)
+    stats[0].tx_roce_errors = 10
+    stats[0].rx_roce_errors = 3
+    stats[1].packet_seq_err = 7
+    model = RdmaDataModel(statistic_list=stats)
+    result = rdma_analyzer.analyze_data(model)
+    assert result.status == ExecutionStatus.ERROR
+    assert "RDMA errors detected in statistics" in result.message
+    assert len(result.events) == 2  # one per interface
+    for event in result.events:
+        assert event.priority == EventPriority.ERROR
+    # Total 3 errors across 2 interfaces
+    assert sum(len(e.data["errors"]) for e in result.events) == 3
+
+
+def test_critical_error_detected(rdma_analyzer, clean_rdma_model):
+    """Test with data containing a critical error (grouped per interface)."""
+    stats = list(clean_rdma_model.statistic_list)
+    stats[0].unrecoverable_err = 1
+    stats[0].res_tx_pci_err = 2
+    model = RdmaDataModel(statistic_list=stats)
+    result = rdma_analyzer.analyze_data(model)
+    assert result.status == ExecutionStatus.ERROR
+    assert "RDMA errors detected in statistics" in result.message
+    assert len(result.events) == 1  # one event per interface
+    assert result.events[0].priority == EventPriority.CRITICAL
+    assert "unrecoverable_err" in result.events[0].data["errors"]
+    assert "res_tx_pci_err" in result.events[0].data["errors"]
+
+
+def test_empty_statistics(rdma_analyzer):
+    """Test with empty statistics list."""
+    model = RdmaDataModel(statistic_list=[], link_list=[])
+    result = rdma_analyzer.analyze_data(model)
+    assert result.status == ExecutionStatus.NOT_RAN
+    assert result.message == "RDMA statistics list is empty"
+
+
+def test_multiple_interfaces_with_errors(rdma_analyzer, clean_rdma_model):
+    """Test with errors across multiple interfaces."""
+    stats = list(clean_rdma_model.statistic_list)
+    stats[0].max_retry_exceeded = 15
+    stats[2].local_ack_timeout_err = 8
+    stats[4].out_of_buffer = 100
+    model = RdmaDataModel(statistic_list=stats)
+    result = rdma_analyzer.analyze_data(model)
+    assert result.status == ExecutionStatus.ERROR
+    assert len(result.events) == 3
+    interfaces = {event.data["interface"] for event in result.events}
+    assert len(interfaces) == 3
+
+
+def test_all_error_types(rdma_analyzer):
+    """Test that several error counters on one interface are grouped in one event."""
+    stats = RdmaStatistics(
+        ifname="bnxt_re_test",
+        port=1,
+        recoverable_errors=1,
+        tx_roce_errors=1,
+        unrecoverable_err=1,
+    )
+    model = RdmaDataModel(statistic_list=[stats])
+    result = rdma_analyzer.analyze_data(model)
+    assert result.status == ExecutionStatus.ERROR
+    assert len(result.events) == 1  # one event per interface
+    assert "unrecoverable_err" in result.events[0].data["errors"]
+    assert result.events[0].priority == EventPriority.CRITICAL
+    assert set(result.events[0].data["errors"].keys()) == {
+        "recoverable_errors",
+        "tx_roce_errors",
+        "unrecoverable_err",
+    }
+
+
+def test_zero_errors_are_ignored(rdma_analyzer):
+    """Test that zero-value errors are not reported."""
+    stats = RdmaStatistics(
+        ifname="bnxt_re_test",
+        port=1,
+        tx_roce_errors=0,
+        rx_roce_errors=0,
+        unrecoverable_err=0,
+    )
+    model = RdmaDataModel(statistic_list=[stats])
+    result = rdma_analyzer.analyze_data(model)
+    assert result.status == ExecutionStatus.OK
+    assert len(result.events) == 0
+
+
+def test_rdma_link_all_active(rdma_analyzer, clean_stats):
+    """Test with RDMA links that are all active and up."""
+    links = [
+        RdmaLink(
+            ifindex=0,
+            ifname="ionic_0",
+            port=1,
+            state="ACTIVE",
+            physical_state="LINK_UP",
+            netdev="benic0p1",
+            netdev_index=3,
+        ),
+        RdmaLink(
+            ifindex=1,
+            ifname="ionic_1",
+            port=1,
+            state="ACTIVE",
+            physical_state="LINK_UP",
+            netdev="benic1p1",
+            netdev_index=4,
+        ),
+    ]
+    model = RdmaDataModel(statistic_list=clean_stats, link_list=links)
+    result = rdma_analyzer.analyze_data(model)
+    assert result.status == ExecutionStatus.OK
+    assert result.message == "No RDMA errors detected in statistics"
+    assert len(result.events) == 0
+
+
+def test_rdma_link_down_detected(rdma_analyzer, clean_stats):
+    """Test that a down RDMA link does not change the result (statistics only)."""
+    links = [
+        RdmaLink(
+            ifindex=0,
+            ifname="ionic_0",
+            port=1,
+            state="ACTIVE",
+            physical_state="LINK_UP",
+            netdev="benic0p1",
+            netdev_index=3,
+        ),
+        RdmaLink(
+            ifindex=1,
+            ifname="ionic_1",
+            port=1,
+            state="DOWN",
+            physical_state="LINK_DOWN",
+            netdev="benic1p1",
+            netdev_index=4,
+        ),
+    ]
+    model = RdmaDataModel(statistic_list=clean_stats, link_list=links)
+    result = rdma_analyzer.analyze_data(model)
+    # Current implementation only checks statistics, not link state
+    assert result.status == ExecutionStatus.OK
+
+
+def test_rdma_link_empty_list(rdma_analyzer, clean_stats):
+    """Test with empty RDMA link list."""
+    model = RdmaDataModel(statistic_list=clean_stats, link_list=[])
+    result = rdma_analyzer.analyze_data(model)
+    assert result.status == ExecutionStatus.OK
+    assert result.message == "No RDMA errors detected in statistics"
+
+
+def test_rdma_link_multiple_interfaces(rdma_analyzer, clean_stats):
+    """Test with three RDMA links on different interfaces, all active and up."""
+    links = [
+        RdmaLink(
+            ifindex=0,
+            ifname="ionic_0",
+            port=1,
+            state="ACTIVE",
+            physical_state="LINK_UP",
+            netdev="benic0p1",
+            netdev_index=3,
+        ),
+        RdmaLink(
+            ifindex=1,
+            ifname="ionic_1",
+            port=1,
+            state="ACTIVE",
+            physical_state="LINK_UP",
+            netdev="benic1p1",
+            netdev_index=4,
+        ),
+        RdmaLink(
+            ifindex=2,
+            ifname="ionic_2",
+            port=1,
+            state="ACTIVE",
+            physical_state="LINK_UP",
+            netdev="benic2p1",
+            netdev_index=5,
+        ),
+    ]
+    model = RdmaDataModel(statistic_list=clean_stats, link_list=links)
+    result = rdma_analyzer.analyze_data(model)
+    assert result.status == ExecutionStatus.OK
+    assert len(result.events) == 0
diff --git a/test/unit/plugin/test_rdma_collector.py b/test/unit/plugin/test_rdma_collector.py
new file mode 100644
index 00000000..0343a588
--- /dev/null
+++ b/test/unit/plugin/test_rdma_collector.py
@@ -0,0 +1,101 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (c) 2026 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+from pathlib import Path
+
+import pytest
+
+from nodescraper.connection.inband.inband import CommandArtifact
+from nodescraper.enums import ExecutionStatus, OSFamily
+from nodescraper.enums.systeminteraction import SystemInteractionLevel
+from nodescraper.plugins.inband.rdma.rdma_collector import RdmaCollector
+from nodescraper.plugins.inband.rdma.rdmadata import RdmaDataModel
+
+
+@pytest.fixture
+def collector(system_info, conn_mock):
+    return RdmaCollector(
+        system_info=system_info,
+        system_interaction_level=SystemInteractionLevel.PASSIVE,
+        connection=conn_mock,
+    )
+
+
+@pytest.fixture
+def rdma_statistic_output():
+    path = Path(__file__).parent / "fixtures" / "rdma_statistic_example_data.json"
+    return path.read_text(encoding="utf-8")  # raw JSON text, used as mocked stdout
+
+
+@pytest.fixture
+def rdma_link_output():
+    path = Path(__file__).parent / "fixtures" / "rdma_link_example_data.json"
+    return path.read_text(encoding="utf-8")  # raw JSON text, used as mocked stdout
+
+
+def test_collect_success(collector, conn_mock, rdma_link_output, rdma_statistic_output):
+    """Successful collection returns RdmaDataModel with statistics and links (full fixtures)."""
+    collector.system_info.os_family = OSFamily.LINUX
+    conn_mock.run_command.side_effect = [
+        CommandArtifact(exit_code=0, stdout=rdma_link_output, stderr="", command="rdma link -j"),
+        CommandArtifact(
+            exit_code=0, stdout=rdma_statistic_output, stderr="", command="rdma statistic -j"
+        ),
+    ]
+    res, data = collector.collect_data()
+    assert res.status == ExecutionStatus.OK
+    assert data is not None
+    assert isinstance(data, RdmaDataModel)
+    # Full statistic fixture has 8 devices (bnxt_re0..bnxt_re7) with full stats
+    assert len(data.statistic_list) == 8
+    assert data.statistic_list[0].ifname == "bnxt_re0"
+    # Full link fixture has 4 ionic links
+    assert len(data.link_list) == 4
+    assert data.link_list[0].ifname == "ionic_0"
+
+
+def test_collect_both_commands_fail(collector, conn_mock):
+    """When both rdma commands fail, status is EXECUTION_FAILURE and data is None."""
+    collector.system_info.os_family = OSFamily.LINUX
+    conn_mock.run_command.return_value = CommandArtifact(
+        exit_code=1, stdout="", stderr="rdma command failed", command="rdma link -j"
+    )
+    res, data = collector.collect_data()
+    assert res.status == ExecutionStatus.EXECUTION_FAILURE
+    assert data is None
+
+
+def test_collect_empty_output(collector, conn_mock):
+    """Empty JSON arrays yield empty lists in model."""
+    collector.system_info.os_family = OSFamily.LINUX
+    conn_mock.run_command.side_effect = [
+        CommandArtifact(exit_code=0, stdout="[]", stderr="", command="rdma link -j"),
+        CommandArtifact(exit_code=0, stdout="[]", stderr="", command="rdma statistic -j"),
+    ]
+    res, data = collector.collect_data()
+    assert res.status == ExecutionStatus.OK
+    assert data is not None
+    assert data.link_list == []
+    assert data.statistic_list == []