def _get_event_summary(self) -> str:
    """Get summary string for events

    Warning and error events are grouped by their description so that a
    message logged many times is reported once with a repeat count.

    Returns:
        str: event summary with counts and descriptions, for example
            "3 warnings: msg a (x2), msg b; 1 errors: msg c". An empty string
            is returned when there are no warning or error events.
    """
    # Function-scope import keeps the block self-contained. Counter is a dict
    # subclass, so descriptions keep first-seen order in the output.
    from collections import Counter

    warning_msg_counts: Counter = Counter()
    error_msg_counts: Counter = Counter()

    for event in self.events:
        if event.priority == EventPriority.WARNING:
            warning_msg_counts[event.description] += 1
        elif event.priority >= EventPriority.ERROR:
            error_msg_counts[event.description] += 1

    summary_parts = []
    # Warnings are always listed before errors.
    for label, msg_counts in (
        ("warnings", warning_msg_counts),
        ("errors", error_msg_counts),
    ):
        if not msg_counts:
            continue
        # Only annotate a message with its count when it occurred more than once.
        details = [
            f"{msg} (x{count})" if count > 1 else msg for msg, count in msg_counts.items()
        ]
        summary_parts.append(f"{sum(msg_counts.values())} {label}: {', '.join(details)}")

    return "; ".join(summary_parts)
b/nodescraper/plugins/inband/pcie/__init__.py @@ -0,0 +1,29 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from .analyzer_args import PcieAnalyzerArgs +from .pcie_plugin import PciePlugin + +__all__ = ["PciePlugin", "PcieAnalyzerArgs"] diff --git a/nodescraper/plugins/inband/pcie/analyzer_args.py b/nodescraper/plugins/inband/pcie/analyzer_args.py new file mode 100644 index 00000000..dc3490a4 --- /dev/null +++ b/nodescraper/plugins/inband/pcie/analyzer_args.py @@ -0,0 +1,63 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
def normalize_to_dict(
    value: Optional[Union[Dict[int, int], int]], vendorid_ep: int
) -> Dict[int, int]:
    """Coerce an expected-value argument into dictionary form.

    Args:
        value: Either None, a single int, or a dict.
        vendorid_ep: Key under which a bare int value is stored.

    Returns:
        The dict itself when ``value`` is a dict, ``{vendorid_ep: value}`` when
        it is an int, and an empty dict for None or any other type.
    """
    if isinstance(value, dict):
        return value
    if isinstance(value, int):
        return {vendorid_ep: value}
    # None and unsupported types both normalize to "nothing expected".
    return {}
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
class PcieAnalyzerInputModel(BaseModel):
    """
    PCIeAnalyzerInputModel is a data model for validating and storing input parameters
    related to PCIe (Peripheral Component Interconnect Express) analysis.
    Attributes:
        exp_speed (int): Expected PCIe speed, Speed is the PCIe Generation, constrained to values between 1 and 5 (inclusive).
        exp_width (int): Expected PCIe width, constrained to values between 1 and 16 (inclusive).
        exp_sriov_count (Optional[int]): Optional expected count of SR-IOV (Single Root I/O Virtualization) instances.
        exp_gpu_count_override (Optional[int]): Optional override for the expected GPU count.
        exp_max_payload_size (Dict[int, int]): Expected max payload size in bytes, keyed by an integer id (0-0xFFFF).
        exp_max_rd_req_size (Dict[int, int]): Expected max read request size in bytes, keyed by an integer id (0-0xFFFF).
        exp_ten_bit_tag_req_en (Dict[int, int]): Expected 10-bit tag request enable (0 or 1), keyed by an integer id (0-0xFFFF).
    """

    exp_speed: int = Field(ge=1, le=5)
    exp_width: int = Field(ge=1, le=16)
    exp_sriov_count: Optional[int] = None
    exp_gpu_count_override: Optional[int] = None
    exp_max_payload_size: Dict[int, int] = Field(default_factory=dict)
    exp_max_rd_req_size: Dict[int, int] = Field(default_factory=dict)
    exp_ten_bit_tag_req_en: Dict[int, int] = Field(default_factory=dict)

    @field_validator("exp_max_rd_req_size", "exp_max_payload_size", mode="before")
    @classmethod
    def validate_exp_max_rd_req_size(cls, v: Optional[Dict[int, int]]) -> Dict[int, int]:
        """Validates the expected max read request / max payload sizes.

        Each value may be given either as the register encoding (0-5) or as the
        size in bytes (128-4096, powers of two). Encoded values are converted to
        bytes so the returned dict always holds byte sizes.

        Raises:
            ValueError: if a value is neither a valid encoding nor a valid byte
                size, or if a key is outside 0-0xFFFF.
        """
        if v is None:
            return {}
        valid_sizes = {128, 256, 512, 1024, 2048, 4096}
        ret_dict = v.copy()
        for key, value in v.items():
            # Convert an encoded value to bytes BEFORE validating it; checking
            # the raw encoded value against the byte sizes rejected every
            # encoded input.
            size = 128 << value if 0 <= value <= 5 else value
            if size not in valid_sizes:
                raise ValueError(
                    "Expected max read request size must be one of: "
                    "0, 1, 2, 3, 4, 5, 128, 256, 512, 1024, 2048, or 4096."
                )
            if key < 0 or key > 0xFFFF:
                raise ValueError(" key must be a valid BDF (0-65535).")
            ret_dict[key] = size
        return ret_dict

    @field_validator("exp_ten_bit_tag_req_en", mode="before")
    @classmethod
    def validate_exp_ten_bit_tag_req_en(cls, v: Optional[Dict[int, int]]) -> Dict[int, int]:
        """Validates the expected 10-bit tag request enable value.

        Raises:
            ValueError: if a key is outside 0-0xFFFF or a value is not 0 or 1.
        """
        if v is None:
            return {}
        for key, value in v.items():
            if key < 0 or key > 0xFFFF:
                raise ValueError("Key must be a valid BDF (0-65535).")
            if value not in {0, 1}:
                raise ValueError("Expected 10-bit tag request enable must be 0 or 1.")
        return v
def validate_reg(self, bdf: str, reg: PcieRegister, log_event: bool) -> bool:
    """Check that a register was collected without error and holds a value

    Parameters
    ----------
    bdf : str
        bus:device:function string, only used in the logged event data
    reg : PcieRegister
        Register to validate
    log_event : bool
        Whether to log a warning event when the register is invalid

    Returns
    -------
    bool
        True when the register has a value and no error, False otherwise
    """
    has_value = reg.val is not None
    collected_cleanly = reg.err is None
    if has_value and collected_cleanly:
        return True

    if log_event:
        self._log_event(
            category=EventCategory.IO,
            description="No value assgined to register or register collection resulted in error",
            priority=EventPriority.WARNING,
            data={"value": reg.val, "error": reg.err, "bdf": bdf},
        )
    return False
def validate_cap_dict(
    self,
    pcie_cfg_space: Dict[BdfStr, PcieCfgSpace],
    cap_struct: Type[PcieCapStructure],
    log_event: bool = True,
) -> set[str]:
    """Find the BDFs whose configuration space holds a valid capability structure

    Parameters
    ----------
    pcie_cfg_space : Dict[BdfStr, PcieCfgSpace]
        Configuration space for each BDF
    cap_struct : Type[PcieCapStructure]
        The capability structure type to look up in each BDF's configuration space
    log_event : bool, optional
        Whether to log a single summary warning when at least one BDF fails
        validation, by default True

    Returns
    -------
    set[str]
        The BDFs whose capability structure passed validate_cap
    """
    cap_name = cap_struct.__name__
    # Per-BDF logging is suppressed; one aggregated event is logged below instead.
    invalid_bdfs = {
        bdf
        for bdf, cfg_space in pcie_cfg_space.items()
        if not self.validate_cap(bdf, cap_name, cfg_space.get_struct(cap_struct), False)
    }

    if log_event and invalid_bdfs:
        self._log_event(
            category=EventCategory.IO,
            description=f"Capability Structure {cap_struct.__name__} not found in a Cfg Space",
            priority=EventPriority.WARNING,
            data={
                "bdf_without_pcie_exp": list(invalid_bdfs),
                "num_bdfs_with_invalid_capability_structure": len(invalid_bdfs),
                "total_bdfs": len(pcie_cfg_space),
            },
        )

    return set(pcie_cfg_space.keys()) - invalid_bdfs
pcie_data : PCIeData + The PCIe data containing configuration space for each BDF + cap_struct : Type[T_CAP] + The capability structure type to validate against each BDF's configuration space + log_event : bool, optional + Whether to log an event if a BDF does not have the specified capability structure, by default True + + Returns + ------- + dict[BdfStr, T_CAP] + A dictionary of BDFs that have the specified capability structure + """ + bdfs_with_cap = self.validate_cap_dict(pcie_cfg_space, cap_struct, log_event=log_event) + bdf_cap_struct_dict: Dict[BdfStr, T_CAP] = {} + for bdf, cfg_space in pcie_cfg_space.items(): + if bdf not in bdfs_with_cap: + continue + cap_struct_data = cfg_space.get_struct(cap_struct) + if cap_struct_data is None: + continue + bdf_cap_struct_dict[bdf] = cap_struct_data + + return bdf_cap_struct_dict + + def check_link_status( + self, + bdf_pcie_express_dict: Dict[str, PcieExp], + exp_speed: int = 5, + exp_width: int = 16, + ): + """Checks PCIe link status for each bdf in the bdf_list and compares with the expected rate/width + + Args: + all_bdf_cfg_space (dict[BdfStr, PcieCfgSpace]): + dict of key bdf and value PcieCfgSpace object which contains register data + exp_speed (int): expected link speed + exp_width (int): expected link width + + Returns: + None + """ + # Key: binary bit position, value: Gen + sv_gen_speed = { + 0b000000: 0, + 0b000001: 1, + 0b000010: 2, + 0b000100: 3, + 0b001000: 4, + 0b010000: 5, + } + for bdf, pcie_exp in bdf_pcie_express_dict.items(): + lnk_stat_reg = pcie_exp.lnk_stat_reg + lnk_cap_2_reg = pcie_exp.lnk_cap_2_reg + try: + if lnk_stat_reg.curr_lnk_speed.val == 0: + self._log_event( + category=EventCategory.IO, + description="Link speed vector is 0", + data={ + "bdf": bdf, + "curr_lnk_speed": lnk_stat_reg.curr_lnk_speed.val, + "supported_lnk_speed_vec": lnk_cap_2_reg.supported_lnk_speed_vec.val, + }, + priority=EventPriority.ERROR, + ) + continue + + curr_speed = lnk_stat_reg.curr_lnk_speed.get_val() + 
def check_uncorr_aer_errors(
    self,
    bdf_ecap_aer: Dict[BdfStr, ECapAer],
):
    """
    Checks the following AER uncorrectable error registers
        - Uncorrectable Error Status Register
        - Uncorrectable Error Mask Register
        - Uncorrectable Error Severity Register

    For every status bit that is set, an event is logged whose description
    states whether the error is masked and whether its severity is Fatal
    (logged as ERROR) or Non-Fatal (logged as WARNING).

    Args:
        bdf_ecap_aer (dict[BdfStr, ECapAer]):
            dict of key bdf and value ECapAer capability structure
    Returns:
        None
    """
    for bdf, ecap_aer in bdf_ecap_aer.items():
        stat_reg: UncorrErrStatReg = ecap_aer.uncorr_err_stat
        mask_reg: UncorrErrMaskReg = ecap_aer.uncorr_err_mask
        sev_reg: UncorrErrSevReg = ecap_aer.uncorr_err_sev
        # Sort each register's fields by bit mask so zip() pairs the status,
        # mask and severity bits that describe the same error.
        sorted_stat_fields = sorted(stat_reg.bit_fields.values(), key=lambda x: x.bit_mask)
        sorted_mask_fields = sorted(mask_reg.bit_fields.values(), key=lambda x: x.bit_mask)
        sorted_sev_fields = sorted(sev_reg.bit_fields.values(), key=lambda x: x.bit_mask)

        for stat_field, mask_field, sev_field in zip(
            sorted_stat_fields,
            sorted_mask_fields,
            sorted_sev_fields,
        ):
            pcie_field_stat_value = stat_field.get_val()
            # A missing value (None) is not an error; only a set status bit is.
            # This matches the handling in check_corr_aer_errors.
            if pcie_field_stat_value is None or pcie_field_stat_value == 0:
                continue

            pcie_field_mask_value = mask_field.get_val()
            pcie_field_sev_value = sev_field.get_val()
            err_descriptor: Dict[str, str] = {
                "bdf": bdf,
                "reg_name": stat_reg.__class__.__name__,
                "field_desc": stat_field.desc,
                "stat": hex(pcie_field_stat_value),
                "mask": (
                    hex(pcie_field_mask_value) if pcie_field_mask_value is not None else "None"
                ),
                "sev": (
                    hex(pcie_field_sev_value) if pcie_field_sev_value is not None else "None"
                ),
            }

            # In the AER severity register a set bit means Fatal and a clear
            # bit means Non-Fatal. An unknown severity (None) is reported as
            # Fatal so that it is never under-reported.
            is_fatal = pcie_field_sev_value != 0
            mask_label = "Masked" if pcie_field_mask_value == 1 else "Unmasked"
            severity_label = "Fatal" if is_fatal else "Non-Fatal"

            self._log_event(
                category=EventCategory.IO,
                description=f"{mask_label} {severity_label} errors were detected",
                priority=EventPriority.ERROR if is_fatal else EventPriority.WARNING,
                data=err_descriptor,
            )
bdf_ecap_aer.items(): + stat_reg: CorrErrStatReg = ecap_aer.corr_err_stat + mask_reg: CorrErrMaskReg = ecap_aer.corr_err_mask + stat_fields = stat_reg.bit_fields + mask_fields = mask_reg.bit_fields + sorted_stat_fields = sorted(stat_fields.values(), key=lambda x: x.bit_mask) + sorted_mask_fields = sorted(mask_fields.values(), key=lambda x: x.bit_mask) + + for stat_field, mask_field in zip( + sorted_stat_fields, + sorted_mask_fields, + ): + stat_val = stat_field.get_val() + if stat_val is not None and stat_val != 0: + err_dict = { + "bdf": bdf, + "reg_description": stat_reg.desc, + "field_description": stat_field.desc, + "bit_field_val": hex(stat_val), + } + if mask_field.get_val() == 1: + self._log_event( + category=EventCategory.IO, + description="Masked Correctable errors were detected", + priority=EventPriority.WARNING, + data=err_dict, + ) + else: + self._log_event( + category=EventCategory.IO, + description="Masked Correctable errors were detected", + priority=EventPriority.ERROR, + data=err_dict, + ) + + def check_pcie_device_status_errors(self, bdf_pcie_express_dict: Dict[str, PcieExp]): + """ + Checks PCIe baseline error reported in Device Status Register + Reference: 9.4.1 Baseline Error Reporting + + Args: + bdf_cfg_space_dict (dict[BdfStr, PcieCfgSpace]): + dict of key bdf and value PcieCfgSpace object which contains register data + Returns: + None + """ + for bdf, pcie_exp_cap in bdf_pcie_express_dict.items(): + err_list = [] + dev_stat_reg = pcie_exp_cap.dev_stat_reg + bit_field_list = [ + dev_stat_reg.corr_err_det, + dev_stat_reg.non_fatal_err_det, + dev_stat_reg.fatal_err_det, + dev_stat_reg.ur_det, + ] + err_list = [bit_field for bit_field in bit_field_list if bit_field.get_val() != 0] + + if len(err_list) > 0: + self._log_event( + category=EventCategory.IO, + description="Device Status errors were detected", + priority=EventPriority.WARNING, + data={ + "bdf": bdf, + "reg_description": dev_stat_reg.desc, + "field_desc_list": [err.desc for err in 
def check_pcie_dev_ctrl_reg(
    self,
    bdf_pcie_express_dict: Dict[str, PcieExp],
    exp_max_payload_size: Optional[int],
    exp_max_rd_req_size: Optional[int],
):
    """Checks 7.5.3.4 Device Control Register (Offset 08h) fields for expected value:
    - Max Payload Size
    - Max Read Request Size

    Args:
        bdf_pcie_express_dict (dict[str, PcieExp]):
            dict of key bdf and value PcieExp capability structure
        exp_max_payload_size (Optional[int]): expected max payload size in bytes, when None it is not checked
        exp_max_rd_req_size (Optional[int]): expected max read request size in bytes, when None it is not checked
    Returns:
        None
    """
    # Register field encoding -> size in bytes
    encoding = {
        0b000: 128,
        0b001: 256,
        0b010: 512,
        0b011: 1024,
        0b100: 2048,
        0b101: 4096,
    }
    for bdf, pcie_exp in bdf_pcie_express_dict.items():
        dev_ctrl_reg = pcie_exp.dev_ctrl_reg
        checks = (
            (
                "Unexpected Max Payload Size detected",
                "max_payload_size",
                dev_ctrl_reg.mps.get_val(),
                exp_max_payload_size,
            ),
            (
                "Unexpected Max Read Request Size detected",
                "max_rd_req_size",
                dev_ctrl_reg.max_rd_req_size.get_val(),
                exp_max_rd_req_size,
            ),
        )
        for description, field_name, raw_val, expected in checks:
            # Each field is checked independently: skip it only when no
            # expectation was given for it or when its own value is missing.
            if expected is None or raw_val is None:
                continue
            # Encodings outside the table decode to None, which never equals
            # the expected size and is therefore reported as a mismatch
            # instead of raising KeyError.
            current = encoding.get(raw_val)
            if current != expected:
                self._log_event(
                    category=EventCategory.IO,
                    description=description,
                    priority=EventPriority.ERROR,
                    data={
                        "bdf": bdf,
                        f"current_{field_name}": current,
                        f"expected_{field_name}": expected,
                    },
                )
def instantaneous_par_err_chk(self, bdf_cfg_space_dict: Dict[str, ECap16Gt]):
    """Look for lanes reporting parity mismatches in the ECap16Gt registers

    Both the parity mismatch status register and the first-retimer parity
    mismatch status register are inspected; an ERROR event is logged for each
    register that has at least one of its low 32 bits set.

    Parameters
    ----------
    bdf_cfg_space_dict : Dict[str, ECap16Gt]
        Dictionary of BDFs and their corresponding ECap16Gt capability structure
    """
    for bdf, ecap_pl_16gt in bdf_cfg_space_dict.items():
        parity_registers = (
            ecap_pl_16gt.parity_mismatch_stat,
            ecap_pl_16gt.retimer_fst_parity_mismatch_stat,
        )
        for parity_register in parity_registers:
            reg_val = parity_register.val
            if reg_val is None:
                continue
            # Each set bit in the low 32 bits marks one lane with a mismatch.
            number_of_bad_lanes = sum((reg_val >> lane) & 1 for lane in range(32))
            if number_of_bad_lanes <= 0:
                continue
            self._log_event(
                category=EventCategory.IO,
                description="Lanes have parity errors",
                priority=EventPriority.ERROR,
                data={
                    "bdf": bdf,
                    "reg_name": parity_register.__class__.__name__,
                    "reg_desc": parity_register.desc,
                    "register_value": parity_register.val,
                    "number_of_bad_lanes": number_of_bad_lanes,
                },
            )
def device_consistancy_chk(self, bdf_cfg_space_dict: Dict[BdfStr, PcieCfgSpace]):
    """Verify that devices sharing a device ID have matching Device Control settings

    BDFs are grouped by device ID. Within each group the max payload size, max
    read request size and extended tag field enable values are compared; a
    WARNING event is logged when any of them differ, or when a BDF in the
    group has no valid PcieExp capability structure.
    """
    # Group BDFs by the device IDs actually present in the collected data.
    dev_id_bdf_map: Dict[int, List[BdfStr]] = {}
    for bdf, cfg_space in bdf_cfg_space_dict.items():
        device_id = cfg_space.type_0_configuration.device_id.val
        if device_id is None:
            self._log_event(
                category=EventCategory.IO,
                description="No value assigned to device id, unable to check consistency due to missing data",
                data={
                    "bdf": bdf,
                },
                priority=EventPriority.WARNING,
            )
            continue
        dev_id_bdf_map.setdefault(device_id, []).append(bdf)

    cap_struct_dict = self.get_valid_cap_dict(bdf_cfg_space_dict, PcieExp, log_event=False)

    for collected_device_id, list_of_bdfs in dev_id_bdf_map.items():
        mps = []
        mrs = []
        tbt = []
        missing_cap_struct = False
        for bdf in list_of_bdfs:
            pcie_exp = cap_struct_dict.get(bdf) if bdf in cap_struct_dict else None
            if bdf not in cap_struct_dict:
                # No valid capability structure for this BDF; it is skipped
                # here and reported through the event logged for the group.
                missing_cap_struct = True
                continue
            dev_ctrl_reg = pcie_exp.dev_ctrl_reg
            mps.append(dev_ctrl_reg.mps.val)
            mrs.append(dev_ctrl_reg.max_rd_req_size.val)
            tbt.append(dev_ctrl_reg.ext_tag_field_en.val)

        # More than one distinct value in any list means the group disagrees.
        values_differ = any(len(set(values)) > 1 for values in (mps, mrs, tbt))
        if not (values_differ or missing_cap_struct):
            continue

        collected_device_id_str = hex(collected_device_id)
        self._log_event(
            category=EventCategory.IO,
            description=f"PCIe device {collected_device_id_str} has inconsistent values",
            priority=EventPriority.WARNING,
            data={
                "dev_id": collected_device_id_str,
                "bdf_list": list_of_bdfs,
                "max_payload_size_list": mps,
                "max_rd_req_size_list": mrs,
                "ext_tag_field_en_list": tbt,
            },
        )
@staticmethod
def filter_pcie_data_by_device_id(
    bdf_cfg_space_dict: Dict[BdfStr, PcieCfgSpace],
    device_ids: Set[int],
) -> Dict[BdfStr, PcieCfgSpace]:
    """Select the configuration spaces whose Type 0 device ID is wanted

    Parameters
    ----------
    bdf_cfg_space_dict : Dict[BdfStr, PcieCfgSpace]
        Dictionary of BDFs and their corresponding PCIe configuration space
    device_ids : Set[int]
        Set of device IDs to keep

    Returns
    -------
    Dict[BdfStr, PcieCfgSpace]
        A new dictionary, in the input's iteration order, holding only the
        entries whose ``type_0_configuration.device_id.val`` is in
        ``device_ids``; the input dictionary is left untouched
    """
    return {
        bdf: cfg_space
        for bdf, cfg_space in bdf_cfg_space_dict.items()
        if cfg_space.type_0_configuration.device_id.val in device_ids
    }
Parameters + ---------- + pcie_data : PcieDataModel + PCIe data model containing collected PCIe configuration space data + expected_gpu_count : Optional[int], optional + Expected GPU count, by default None (no check performed) + """ + if expected_gpu_count is None: + return + + gpu_count_from_pcie = 0 + for cfg_space in pcie_data.pcie_cfg_space.values(): + vendor_id = cfg_space.type_0_configuration.vendor_id.val + if vendor_id == self.system_info.vendorid_ep: + gpu_count_from_pcie += 1 + + if gpu_count_from_pcie != expected_gpu_count: + self._log_event( + category=EventCategory.IO, + description="GPU count mismatch", + priority=EventPriority.ERROR, + data={ + "gpu_count_from_pcie": gpu_count_from_pcie, + "expected_gpu_count": expected_gpu_count, + }, + ) + else: + self._log_event( + category=EventCategory.IO, + description="GPU count matches expected", + priority=EventPriority.INFO, + data={ + "gpu_count": gpu_count_from_pcie, + }, + ) + + def analyze_data( + self, data: PcieDataModel, args: Optional[PcieAnalyzerArgs] = None + ) -> TaskResult: + """Check PCIe data for errors by analyzing the PCIe register space and + checking the enumeration of the GPUs and optional SR-IOV VFs + + Parameters + ---------- + data : PcieDataModel + PCIe data model containing collected PCIe configuration space data + args : Optional[PcieAnalyzerArgs], optional + Analyzer arguments containing expected values for validation, by default None + + Returns + ------- + TaskResult + Result of the analysis + """ + if args is None: + args = PcieAnalyzerArgs() + + exp_speed = args.exp_speed + exp_width = args.exp_width + exp_sriov_count = args.exp_sriov_count + exp_gpu_count_override = args.exp_gpu_count_override + exp_max_payload_size = normalize_to_dict( + args.exp_max_payload_size, self.system_info.vendorid_ep + ) + exp_max_rd_req_size = normalize_to_dict( + args.exp_max_rd_req_size, self.system_info.vendorid_ep + ) + exp_ten_bit_tag_req_en = normalize_to_dict( + args.exp_ten_bit_tag_req_en, 
self.system_info.vendorid_ep + ) + try: + pcie_input_data = PcieAnalyzerInputModel( + exp_speed=exp_speed, + exp_width=exp_width, + exp_sriov_count=exp_sriov_count, + exp_gpu_count_override=exp_gpu_count_override, + exp_ten_bit_tag_req_en=exp_ten_bit_tag_req_en, + exp_max_payload_size=exp_max_payload_size, + exp_max_rd_req_size=exp_max_rd_req_size, + ) + except ValidationError as val_error: + self._log_event( + category=EventCategory.RUNTIME, + description="User input for PcieAnalyzerModel is invalid", + priority=EventPriority.ERROR, + data={ + "validation_error": get_exception_traceback(val_error), + "valid_input": { + "exp_speed": "int, 1-5", + "exp_width": "int, 1-16", + "exp_sriov_count": "Optional[int]", + "exp_gpu_count_override": "Optional[int]", + }, + "actual_input": { + "exp_speed": exp_speed, + "exp_width": exp_width, + "exp_sriov_count": exp_sriov_count, + "exp_gpu_count_override": exp_gpu_count_override, + }, + }, + ) + return self.result + + pcie_data: PcieDataModel = data + + if pcie_data.pcie_cfg_space == {} and pcie_data.vf_pcie_cfg_space == {}: + # If both of the PCIe Configuration spaces are + self._log_event( + category=EventCategory.IO, + description="No PCIe config space found", + priority=EventPriority.WARNING, + ) + return self.result + + # Check every link in the PCIe configuration space for the expected capability structure, + # but don't check VF since those will be 0 + bdf_pcie_express_dict = self.get_valid_cap_dict( + pcie_data.pcie_cfg_space, + PcieExp, + log_event=True, + ) + self.check_link_status( + bdf_pcie_express_dict=bdf_pcie_express_dict, + exp_speed=exp_speed, + exp_width=exp_width, + ) + + amd_device_ids = set() + for cfg_space in pcie_data.pcie_cfg_space.values(): + vendor_id = cfg_space.type_0_configuration.vendor_id.val + device_id = cfg_space.type_0_configuration.device_id.val + if vendor_id == self.system_info.vendorid_ep and device_id is not None: + amd_device_ids.add(device_id) + + # Filter PCIe data for AMD GPUs + 
oam_pcie_data = self.filter_pcie_data_by_device_id( + bdf_cfg_space_dict=pcie_data.pcie_cfg_space, + device_ids=amd_device_ids, + ) + + amd_vf_device_ids = set() + if pcie_data.vf_pcie_cfg_space is not None: + for cfg_space in pcie_data.vf_pcie_cfg_space.values(): + vendor_id = cfg_space.type_0_configuration.vendor_id.val + device_id = cfg_space.type_0_configuration.device_id.val + if vendor_id == self.system_info.vendorid_ep and device_id is not None: + amd_vf_device_ids.add(device_id) + + oam_vf_pcie_data = self.filter_pcie_data_by_device_id( + bdf_cfg_space_dict=pcie_data.vf_pcie_cfg_space, + device_ids=amd_vf_device_ids, + ) + else: + oam_vf_pcie_data = {} + + # Include bridge/retimer devices (0x1500, 0x1501) + us_ds_retimer = self.filter_pcie_data_by_device_id( + bdf_cfg_space_dict=pcie_data.pcie_cfg_space, + device_ids={0x1500, 0x1501}, + ) + ubb_data = {**oam_pcie_data, **us_ds_retimer} + ubb_data_with_vf = {**ubb_data, **oam_vf_pcie_data} + # Type 0 Configuration Space Checks + self.check_pcie_status_errors(bdf_cfg_space_dict=ubb_data_with_vf) + # Check other capability structures + dev_ids = set( + list(pcie_input_data.exp_max_payload_size.keys()) + + list(pcie_input_data.exp_max_rd_req_size.keys()) + + list(pcie_input_data.exp_ten_bit_tag_req_en.keys()) + ) + for device_id_to_check in dev_ids: + cfg_space_filtered = self.filter_pcie_data_by_device_id( + bdf_cfg_space_dict=pcie_data.pcie_cfg_space, + device_ids={device_id_to_check}, + ) + self.check_pcie_exp_capability_structure_config( + cfg_space_filtered, + pcie_input_data.exp_max_payload_size.get(device_id_to_check), + pcie_input_data.exp_max_rd_req_size.get(device_id_to_check), + pcie_input_data.exp_ten_bit_tag_req_en.get(device_id_to_check), + ) + + # run with vfs for AERs and PCIe EXP errors + self.check_pcie_exp_capability_structure_errors(bdf_cfg_space_dict=ubb_data_with_vf) + self.check_ecap_aer_errors(bdf_cfg_space_dict=ubb_data_with_vf) + self.check_ecap_16gt_regs(bdf_cfg_space_dict=ubb_data) + 
self.check_ecap_sec_pci_regs(bdf_cfg_space_dict=ubb_data) + + if amd_device_ids: + self.device_consistancy_chk( + bdf_cfg_space_dict=ubb_data, + ) + else: + self._log_event( + category=EventCategory.RUNTIME, + description="No AMD GPU devices found, skipping device consistency check", + priority=EventPriority.INFO, + ) + + self.check_gpu_count(pcie_data, exp_gpu_count_override) + + return self.result diff --git a/nodescraper/plugins/inband/pcie/pcie_collector.py b/nodescraper/plugins/inband/pcie/pcie_collector.py new file mode 100755 index 00000000..c6d3e624 --- /dev/null +++ b/nodescraper/plugins/inband/pcie/pcie_collector.py @@ -0,0 +1,690 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import re +from enum import Enum +from typing import Dict, List, Optional, Set, Tuple, Union + +from pydantic import ValidationError + +from nodescraper.base import InBandDataCollector +from nodescraper.connection.inband import TextFileArtifact +from nodescraper.enums import ( + EventCategory, + EventPriority, + ExecutionStatus, + OSFamily, + SystemInteractionLevel, +) +from nodescraper.models import TaskResult +from nodescraper.utils import get_all_subclasses, get_exception_details + +from .pcie_data import ( + MAX_CAP_ID, + MAX_ECAP_ID, + CapabilityEnum, + ExtendedCapabilityEnum, + PcieCapStructure, + PcieCfgSpace, + PcieDataModel, + Type0Configuration, + Type1Configuration, +) + + +class PcieCollector(InBandDataCollector[PcieDataModel, None]): + """class for collection of PCIe data only supports Linux OS type. + + This class collects the PCIE config space using the lspci hex dump and then parses the hex dump to get the + PCIe configuration space for the GPUs in the system. If the system interaction level is set to STANDARD or higher, + then the entire pcie configuration space is collected for the GPUs in the system. If the system interaction level + is set to SURFACE then, only the first 64 bytes of the pcie configuration space is collected for the GPUs in the system. 
+ + This class will collect important PCIe data from the system running the commands + - `lspci -vvv` : Verbose collection of PCIe data + - `lspci -vt`: Tree view of PCIe data + - `lspci -PP`: Path view of PCIe data for the GPUs + - If system interaction level is set to STANDARD or higher, the following commands will be run with sudo: + - `lspci -xxxx`: Hex view of PCIe data for the GPUs + - otherwise the following commands will be run without sudo: + - `lspci -x`: Hex view of PCIe data for the GPUs + - `lspci -d :` : Count the number of GPUs in the system with this command + - If system interaction level is set to STANDARD or higher, the following commands will be run with sudo: + - The sudo lspci -xxxx command is used to collect the PCIe configuration space for the GPUs in the system + - otherwise the following commands will be run without sudo: + - The lspci -x command is used to collect the PCIe configuration space for the GPUs in the system + + """ + + SUPPORTED_OS_FAMILY: Set[OSFamily] = {OSFamily.LINUX} + + DATA_MODEL = PcieDataModel + + CMD_LSPCI_VERBOSE = "lspci -vvv" + CMD_LSPCI_TREE = "lspci -vt" + CMD_LSPCI_PATH = "lspci -PP" + CMD_LSPCI_HEX_SUDO = "lspci -xxxx" + CMD_LSPCI_HEX = "lspci -x" + CMD_LSPCI_AMD_DEVICES = "lspci -d {vendor_id}: -nn" + CMD_LSPCI_PATH_DEVICE = "lspci -PP -d {vendor_id}:{dev_id}" + + def _detect_amd_device_ids(self) -> dict[str, list[str]]: + """Detect AMD GPU device IDs from the system using lspci. 
+ + Returns: + dict[str, list[str]]: Dictionary with 'vendor_id', 'device_ids', and 'vf_device_ids' + """ + vendor_id_hex = format(self.system_info.vendorid_ep, "x") + result: dict[str, list[str]] = { + "vendor_id": [vendor_id_hex], + "device_ids": [], + "vf_device_ids": [], + } + + res = self._run_sut_cmd( + self.CMD_LSPCI_AMD_DEVICES.format(vendor_id=vendor_id_hex), + sudo=False, + log_artifact=False, + ) + if res.exit_code == 0 and res.stdout: + # Pattern: [vendor:device] + device_id_pattern = rf"\[{vendor_id_hex}:([0-9a-fA-F]{{4}})\]" + # Pattern to detect VF in description + vf_pattern = r"Virtual Function" + + for line in res.stdout.splitlines(): + matches = re.findall(device_id_pattern, line) + if matches: + device_id = matches[0].lower() + # Check if it's a VF + if re.search(vf_pattern, line, re.IGNORECASE): + if device_id not in result["vf_device_ids"]: + result["vf_device_ids"].append(device_id) + self.logger.info(f"Detected AMD VF device ID: {device_id}") + else: + if device_id not in result["device_ids"]: + result["device_ids"].append(device_id) + self.logger.info(f"Detected AMD device ID: {device_id}") + + self._log_event( + category=EventCategory.IO, + description="Detected AMD GPU device IDs from system", + data=result, + priority=EventPriority.INFO, + ) + + return result + + def show_lspci_verbose(self, sudo=True) -> Optional[str]: + """Show lspci with -vvv.""" + return self._run_os_cmd(self.CMD_LSPCI_VERBOSE, sudo=sudo) + + def show_lspci_verbose_tree(self, sudo=True) -> Optional[str]: + """Show lspci with -vt.""" + return self._run_os_cmd(self.CMD_LSPCI_TREE, sudo=sudo) + + def show_lspci_path(self, sudo=True) -> Optional[str]: + """Show lspci with -PP.""" + return self._run_os_cmd(self.CMD_LSPCI_PATH, sudo=sudo) + + def show_lspci_hex(self, bdf: Optional[str] = None, sudo=True) -> Optional[str]: + """Show lspci with -xxxx.""" + if sudo: + hex_arg = "-xxxx" + else: + # Sudo required for whole pcie configuration space + hex_arg = "-x" + + if bdf: + 
def _get_upstream_bdf_from_buspath(
    self,
    vendor_id: str,
    dev_id: str,
    upstream_steps_limit: Optional[int] = 0,
    sudo=True,
) -> Optional[Dict[str, List[str]]]:
    """Get the upstream BDFs for every device matching a vendor/device id.

    Runs ``lspci -PP -d <vendor>:<device>`` and splits the leading
    ``bdf/bdf/.../bdf`` bus path of each output line.

    Parameters
    ----------
    vendor_id : str
        A pcie vendor id
    dev_id : str
        A pcie device id
    upstream_steps_limit : Optional[int]
        The limit on the number of upstream devices to collect, by default 0
        (only the device itself). ``None`` collects the whole path.
    sudo : bool
        Run the command as sudo or not, by default True

    Returns
    -------
    Optional[Dict[str, List[str]]]
        Dictionary keyed by the matching device's BDF (last element of the
        bus path). Each value lists that device first, followed by its
        upstream devices from nearest to farthest. ``None`` when the command
        failed or produced no output.
    """
    bus_path_all_gpus = self._run_os_cmd(
        self.CMD_LSPCI_PATH_DEVICE.format(vendor_id=vendor_id, dev_id=dev_id),
        sudo=sudo,
    )
    if not bus_path_all_gpus:
        self._log_event(
            category=EventCategory.IO,
            description="Failed to get bus path info for vendor/device ID.",
            data={"vendor_id": vendor_id, "dev_id": dev_id},
            priority=EventPriority.INFO,
        )
        return None

    upstream_bdfs: Dict[str, List[str]] = {}
    for bus_path in bus_path_all_gpus.splitlines():
        if not bus_path.strip():
            # A blank line carries no bus path; without this guard it would
            # register an empty-string BDF.
            continue
        # The bus path is the first space-delimited field of the line
        bus_path_list = bus_path.split(" ", 1)[0].split("/")
        if upstream_steps_limit is not None and len(bus_path_list) < upstream_steps_limit + 1:
            # We don't have enough upstream devices to collect; report it and
            # fall through to collect whatever is available.
            self._log_event(
                category=EventCategory.RUNTIME,
                description="Not enough upstream devices found.",
                data={
                    "bus_path": bus_path,
                    "upstream_steps_limit": upstream_steps_limit,
                    "bus_path_list": bus_path_list,
                },
                priority=EventPriority.WARNING,
            )
        # Flip the bus_path_list to get GPU first and then upstream devices
        bus_path_list.reverse()
        bdf_str = bus_path_list[0]
        if upstream_steps_limit is None:
            upstream_bdfs[bdf_str] = bus_path_list
        else:
            # Upstream + 1 to always include the GPU itself. The clamp keeps a
            # negative limit from turning into a from-the-end slice.
            upstream_bdfs[bdf_str] = bus_path_list[: max(upstream_steps_limit + 1, 0)]

    return upstream_bdfs
def read_register(self, width: int, offset: int, config_data: List[int]):
    """Read a little-endian register out of a parsed config space dump.

    Parameters
    ----------
    width : int
        Register width in *bits* (8, 16, 32 or 64). The width is divided by
        eight to get the number of bytes read, so any remainder bits are
        dropped and a width below 8 reads nothing and yields 0.
    offset : int
        Byte offset of the register's least significant byte in ``config_data``
    config_data : List[int]
        A list of ints representing the hex dump from lspci -x or lspci -xxxx

    Returns
    -------
    int
        The register value, with the byte at ``offset`` as the least
        significant byte

    Raises
    ------
    IndexError
        If the register does not lie completely inside ``config_data``. A
        negative offset is rejected as well; it would otherwise silently wrap
        around and read bytes from the end of the dump.
    """
    num_bytes = width >> 3
    if num_bytes > 0 and (offset < 0 or offset + num_bytes > len(config_data)):
        raise IndexError(
            f"register at offset {offset} ({num_bytes} bytes) is outside the "
            f"{len(config_data)} byte config space dump"
        )
    return sum(
        byte << (index * 8)
        for index, byte in enumerate(config_data[offset : offset + num_bytes])
    )
def cap_finder(
    self,
    config_data: List[int],
    cap_pointer: int,
    cap_data: Optional[Dict[int, int]] = None,
) -> Dict[int, int]:
    """Walk the (non-extended) PCI capability list found in a config space dump

    Parameters
    ----------
    config_data : List[int]
        A list of ints representing the hex dump from lspci -x or lspci -xxxx
    cap_pointer : int
        Offset of the capability to start at, or 0x34 (the Capabilities
        Pointer register) to start from the head of the list
    cap_data : Optional[Dict[int, int]], optional
        Dictionary to add the discovered capabilities to, by default None
        (a new dictionary is created)

    Returns
    -------
    cap_data : Dict[int, int]
        Key is the cap_id and value is the cap_pointer, use
        CapabilityEnum(cap_id) to get the name. An empty dictionary is
        returned when a capability ID above MAX_CAP_ID is encountered.
    """
    if cap_data is None:
        cap_data = {}

    if cap_pointer == 0x34:
        # Special case for the first cap pointer, this one doesn't have an
        # associated cap_id: it only holds the offset of the first capability.
        if len(config_data) <= 0x34:
            return cap_data
        # The bottom two bits of a capability pointer are reserved and have
        # to be masked off before the value is used as an offset.
        cap_pointer = config_data[0x34] & 0xFC
        if cap_pointer == 0:
            # Null pointer: the function implements no capabilities. Offset 0
            # holds the vendor ID and must not be decoded as a capability.
            return cap_data

    visited = set()
    while cap_pointer not in visited:
        if cap_pointer + 1 >= len(config_data):
            # prevent an illegal access to the list (e.g. a 64 byte dump whose
            # capabilities live past the standard header)
            return cap_data
        visited.add(cap_pointer)
        cap_id = config_data[cap_pointer]
        if cap_id > MAX_CAP_ID:
            # Break if the cap_id is greater than the max cap id
            self._log_event(
                category=EventCategory.IO,
                description=f"Invalid Capability ID detected {cap_id}",
                priority=EventPriority.ERROR,
                data={"cap_id": cap_id},
            )
            return {}
        cap_data[cap_id] = cap_pointer
        cap_pointer = config_data[cap_pointer + 1] & 0xFC
        if cap_pointer == 0:
            return cap_data
    # The chain pointed back at a capability that was already visited; stop
    # instead of looping forever on a malformed list.
    return cap_data
def get_pcie_common_cfg(
    self,
    type_x_configuration: Union[type[Type0Configuration], type[Type1Configuration]],
    config_data: List[int],
) -> Union[Type0Configuration, Type1Configuration]:
    """Build a Type 0 or Type 1 configuration header from the hex dump bytes

    Parameters
    ----------
    type_x_configuration : Union[type[Type0Configuration], type[Type1Configuration]]
        The header class to instantiate, either Type0Configuration or Type1Configuration
    config_data : List[int]
        Config data from lspci -x or lspci -xxxx

    Returns
    -------
    Union[Type0Configuration, Type1Configuration]
        A new instance of the requested class with every register reported by
        its ``iter_regs()`` read from ``config_data`` and stored via ``set_regs()``
    """
    header = type_x_configuration()
    # Each register is read from a copy of the definition handed out by
    # iter_regs(), at the width and offset that definition declares.
    register_copies = (
        (name, definition.model_copy()) for name, definition in header.iter_regs()
    )
    values: Dict[str, int] = {
        name: self.read_register(reg.width, reg.offset, config_data)
        for name, reg in register_copies
    }
    header.set_regs(values)
    return header
def get_cfg_by_bdf(self, bdf: str, sudo=True) -> PcieCfgSpace:
    """Will fill out a PcieCfgSpace object with the PCIe configuration space for a given BDF

    Parameters
    ----------
    bdf : str
        Bus-device-function of the device to dump
    sudo : bool
        Run lspci as sudo or not, by default True

    Returns
    -------
    PcieCfgSpace
        The parsed configuration space. A default-constructed PcieCfgSpace is
        returned (and an ERROR event logged) when the hex dump could not be
        read or holds fewer than 64 bytes.
    """
    # Without sudo the dump comes from `lspci -x`, so 64 bytes is the most
    # that can be required here; capabilities past the dump are skipped by
    # the capability finders' own bounds checks.
    min_cfg_bytes = 64

    hex_data_raw = self.show_lspci_hex(bdf, sudo=sudo)
    if hex_data_raw is None:
        self._log_event(
            category=EventCategory.IO,
            description="Failed to get hex data for BDF.",
            data={"bdf": bdf},
            priority=EventPriority.ERROR,
        )
        return PcieCfgSpace()

    hex_data: List[int] = self.parse_hex_dump(hex_data_raw)
    if len(hex_data) < min_cfg_bytes:
        self._log_event(
            category=EventCategory.IO,
            description="Hex data is not the expected length",
            data={"bdf": bdf, "length": len(hex_data)},
            priority=EventPriority.ERROR,
        )
        return PcieCfgSpace()

    cap_data, ecap_data = self.discover_capability_structure(hex_data)
    return self.get_pcie_cfg(hex_data, cap_data, ecap_data)
def _log_pcie_artifacts(
    self,
    lspci_pp: Optional[str],
    lspci_hex: Optional[str],
    lspci_tree: Optional[str],
    lspci_verbose: Optional[str],
):
    """Attach the collected lspci outputs to the task result as text file artifacts.

    Outputs that are None are skipped. The remaining ones are appended to
    ``self.result.artifacts`` in the order hex, tree, verbose, path.
    """
    named_outputs = (
        ("lspci_hex.txt", lspci_hex),
        ("lspci_tree.txt", lspci_tree),
        ("lspci_verbose.txt", lspci_verbose),
        ("lspci_pp.txt", lspci_pp),
    )
    for filename, contents in named_outputs:
        if contents is None:
            continue
        self.result.artifacts.append(TextFileArtifact(filename=filename, contents=contents))
+ + Returns + ------- + Optional[PcieDataModel] + The data in a PcieDataModel object or None on failure + """ + minimum_system_interaction_level_required_for_sudo = SystemInteractionLevel.INTERACTIVE + + try: + if ( + isinstance(self.system_interaction_level, SystemInteractionLevel) + and self.system_interaction_level + >= minimum_system_interaction_level_required_for_sudo + ): + use_sudo = True + else: + use_sudo = False + + if upstream_steps_to_collect is None: + upstream_steps_to_collect = None + + # Detect AMD device IDs dynamically from the system + detected_devices = self._detect_amd_device_ids() + vendor_id = ( + detected_devices["vendor_id"][0] + if detected_devices["vendor_id"] + else format(self.system_info.vendorid_ep, "x") + ) + device_ids = detected_devices["device_ids"] + vf_device_ids = detected_devices["vf_device_ids"] + + pcie_cfg_dict: Dict[str, PcieCfgSpace] = {} + vf_pcie_cfg_data: Dict[str, PcieCfgSpace] = {} + + # Collect PCIe config space for each detected device ID + for dev_id in device_ids: + cfg_space = self._get_gpu_cfg_space( + vendor_id=vendor_id, + device_id=dev_id, + upstream_steps_from_gpu=upstream_steps_to_collect, + sudo=use_sudo, + ) + if cfg_space: + pcie_cfg_dict.update(cfg_space) + + # Collect VF PCIe config space for each detected VF device ID + for dev_id_vf in vf_device_ids: + vf_cfg_space = self._get_gpu_cfg_space( + vendor_id=vendor_id, + device_id=dev_id_vf, + upstream_steps_from_gpu=0, + sudo=use_sudo, + ) + if vf_cfg_space: + vf_pcie_cfg_data.update(vf_cfg_space) + + lspci_hex = self.show_lspci_hex(sudo=use_sudo) + lspci_verbose = self.show_lspci_verbose(sudo=use_sudo) + lspci_verbose_tree = self.show_lspci_verbose_tree(sudo=use_sudo) + lspci_path = self.show_lspci_path(sudo=use_sudo) + self._log_pcie_artifacts( + lspci_pp=lspci_path, + lspci_hex=lspci_hex, + lspci_tree=lspci_verbose_tree, + lspci_verbose=lspci_verbose, + ) + pcie_data = PcieDataModel( + pcie_cfg_space=pcie_cfg_dict, + 
def discover_capability_structure(
    self, hex_dump: List[int]
) -> Tuple[Dict[int, int], Dict[int, int]]:
    """Locate the capability and extended capability structures in a hex dump

    Parameters
    ----------
    hex_dump : List[int]
        A list of ints from lspci -x or lspci -xxxx

    Returns
    -------
    Tuple[Dict[int, int], Dict[int, int]]
        The result of ``cap_finder`` started at 0x34, followed by the result
        of ``extended_cap_finder`` started at 0x100. In both dictionaries the
        key is the cap_id and the value is the cap_pointer.
    """
    first_cap_pointer_offset = 0x34
    first_ecap_offset = 0x100
    return (
        self.cap_finder(hex_dump, first_cap_pointer_offset),
        self.extended_cap_finder(hex_dump, first_ecap_offset),
    )
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
def field_hex_val_validator(value: Optional[Union[str, int]]) -> Optional[int]:
    """Convert a serialized register value into an ``int``.

    This function is registered as a ``mode="before"`` field validator for
    ``val``, so it receives the raw input before any type coercion.

    Args:
        value: ``None``, a base-16 string (with or without a ``0x`` prefix), or
            an integer that has already been decoded.

    Returns:
        ``None`` when ``value`` is ``None``, otherwise the integer value.

    Raises:
        ValueError: If ``value`` is a string that is not valid base-16.
    """
    if value is None:
        return None
    # An int needs no parsing, and int(<int>, 16) would raise TypeError because
    # an explicit base is only accepted together with a string argument.
    if isinstance(value, int):
        return value
    return int(value, 16)
class ExtendedCapabilityEnum(Enum):
    """Extended capability IDs for PCI Configuration Space.

    Each member names one extended capability. The per-member comments leave
    out the repeated "Extended Capability" suffix.
    """

    NULL = 0x0000  # Null
    AER = 0x0001  # Advanced Error Reporting
    VCEC = 0x0002  # Virtual Channel
    DSN = 0x0003  # Device Serial Number
    PWR_BUDGET = 0x0004  # Power Budgeting
    LNK_DCLR = 0x0005  # Root Complex Link Declaration
    LNK_CEC = 0x0006  # Root Complex Internal Link Control
    RCECOLL = 0x0007  # Root Complex Event Collector Endpoint Association
    MFVC = 0x0008  # Multi-Function Virtual Channel
    VC2 = 0x0009  # Virtual Channel (second ID; VCEC is 0x0002)
    RCRB = 0x000A  # RCRB Header
    VNDR = 0x000B  # Vendor-specific
    CAC = 0x000C  # Configuration Access Correlation
    ACS = 0x000D  # ACS
    ARI = 0x000E  # ARI
    ATS = 0x000F  # ATS
    SRIOV = 0x0010  # SR-IOV
    MRIOV = 0x0011  # MR-IOV; must not implement
    MULTCAST = 0x0012  # Multicast
    PAGE_REQ = 0x0013  # Page Request (PRI)
    AMD = 0x0014  # Reserved for AMD
    RBAR = 0x0015  # Resizable BAR
    DPA = 0x0016  # Dynamic Power Allocation (DPA)
    TPH = 0x0017  # TPH Requester
    LTR = 0x0018  # LTR; controlled using Function 0, which is never a VF
    SPCI = 0x0019  # Secondary PCI Express
    PMUX = 0x001A  # PMUX; controlled using Function 0, which is never a VF
    PASID = 0x001B  # PASID
    LN = 0x001C  # LN Requester (LNR)
    DPC = 0x001D  # DPC
    L1PM = 0x001E  # L1 PM Substates; controlled using Function 0, which is never a VF
    PTM = 0x001F  # Precision Time Management (PTM)
    MPCIE = 0x0020  # PCI Express over M-PHY (M-PCIe)
    FRS = 0x0021  # FRS Queueing
    RTR = 0x0022  # Readiness Time Reporting
    DVENDR = 0x0023  # Designated vendor-specific
    VFBAR = 0x0024  # VF Resizable BAR
    DLF = 0x0025  # Data Link Feature
    PL_16GT = 0x0026  # Physical Layer 16.0 GT/s
    LM = 0x0027  # Lane Margining at the Receiver
    HID = 0x0028  # Hierarchy ID
    NPEM = 0x0029  # Native PCIe Enclosure Management (NPEM)
    PL_32GT = 0x002A  # Physical Layer 32.0 GT/s
    ALT_PROTOCOL = 0x002B  # Alternate Protocol
    SFI = 0x002C  # System Firmware Intermediary (SFI)
    DOE = 0x002E  # Data Object Exchange
    INT_DOE = 0x0030  # Integrity and Data Encryption
class PcieCapStructure(BaseModel):
    """Base model for one capability or extended capability structure.

    Holds the structure's description and offset. ``cap_id`` is a class-level
    enum member declared here and bound by subclasses. Registers are declared
    as ``PcieRegister`` fields and are discovered by type by the helpers below.
    """

    cap_id: ClassVar[Enum]
    desc: str
    offset: int = 0
    extended: Optional[bool] = False

    def iter_regs(self) -> Generator[tuple[str, PcieRegister], Any, None]:
        """Yield ``(field_name, register)`` for every ``PcieRegister`` field."""
        for field_name, field_value in self:
            if not isinstance(field_value, PcieRegister):
                continue
            yield field_name, field_value

    def set_regs(self, values: Dict[str, int]):
        """Assign ``val`` on every register from ``values``, keyed by field name.

        A register whose field name is missing from ``values`` gets ``None``.
        """
        for reg_name, reg in self.iter_regs():
            reg.val = values.get(reg_name)

    def null_err_regs(self, filters: Optional[List[str]] = None):
        """Return the names of registers that have no value or carry an error.

        A register is reported when its ``val`` is ``None`` or its ``err`` is
        not ``None``. No register is modified.

        Args:
            filters: When given, only registers whose field name appears in this
                list are considered; ``None`` considers every register.

        Returns:
            A list of field names, in field iteration order.
        """
        return [
            reg_name
            for reg_name, reg in self.iter_regs()
            if (filters is None or reg_name in filters)
            and (reg.val is None or reg.err is not None)
        ]
class StatusRegister(PcieRegister):
    """Status Register in PCI Configuration Space (16 bits at offset 0x06).

    Masks are plain hex literals; the comment above each field gives the bit
    position(s) the mask selects. Bits 1, 2 and 6 have no field declared.
    """

    offset: int = 0x06
    width: int = 16
    desc: str = "Status Register"
    # bit 0
    immed_readiness: PcieBitField = PcieBitField(bit_mask=0x0001, desc="Immediate Readiness")
    # bit 3
    int_stat: PcieBitField = PcieBitField(bit_mask=0x0008, desc="Interrupt Status")
    # bit 4
    cap_list: PcieBitField = PcieBitField(bit_mask=0x0010, desc="Capabilities List")
    # bit 5
    sixty_six_mhz_cap: PcieBitField = PcieBitField(bit_mask=0x0020, desc="66 MHz Capable")
    # bit 7
    fast_b2b_trans_cap: PcieBitField = PcieBitField(
        bit_mask=0x0080, desc="Fast Back-to-Back Transactions Capable"
    )
    # bit 8
    mstr_data_par_err: PcieBitField = PcieBitField(
        bit_mask=0x0100, desc="Master Data Parity Error"
    )
    # bits 10:9
    devsel_timing: PcieBitField = PcieBitField(bit_mask=0x0600, desc="DEVSEL Timing")
    # bit 11
    signaled_target_abort: PcieBitField = PcieBitField(
        bit_mask=0x0800, desc="Signaled Target Abort"
    )
    # bit 12
    rcvd_target_abort: PcieBitField = PcieBitField(bit_mask=0x1000, desc="Received Target Abort")
    # bit 13
    rcvd_mstr_abort: PcieBitField = PcieBitField(bit_mask=0x2000, desc="Received Master Abort")
    # bit 14
    signaled_sys_err: PcieBitField = PcieBitField(bit_mask=0x4000, desc="Signaled System Error")
    # bit 15
    det_parity_err: PcieBitField = PcieBitField(bit_mask=0x8000, desc="Detected Parity Error")
PcieRegister = PcieRegister(width=16, offset=0x00) + device_id: PcieRegister = PcieRegister(width=16, offset=0x02) + command: CommandRegister = CommandRegister() + status: StatusRegister = StatusRegister() + revision_id: PcieRegister = PcieRegister(width=8, offset=0x08) + prog_if: PcieRegister = PcieRegister(width=8, offset=0x09) + subclass: PcieRegister = PcieRegister(width=8, offset=0x0A) + class_code: PcieRegister = PcieRegister(width=8, offset=0x0B) + cache_line_size: PcieRegister = PcieRegister(width=8, offset=0x0C) + latency_timer: PcieRegister = PcieRegister(width=8, offset=0x0D) + header_type: PcieRegister = PcieRegister(width=8, offset=0x0E) + bist: PcieRegister = PcieRegister(width=8, offset=0x0F) + + +class Type0Configuration(Type01Common): + """Type 0 Specific Common Configuration Space""" + + cap_id: ClassVar[Enum] = CapabilityEnum.BASE_REGISTER + desc: str = "Type 0 Specific Common Configuration Space" + base_address_0: PcieRegister = PcieRegister( + offset=0x10, + width=32, + desc="7.5.1.2.1 Base Address Registers (Offset 10h - 24h) / 7.5.1.3.1 Type 1 Base Address Registers (Offset 10h-14h)", + ) + base_address_1: PcieRegister = PcieRegister( + offset=0x14, + width=32, + desc="7.5.1.2.1 Base Address Registers (Offset 10h - 24h) / 7.5.1.3.1 Type 1 Base Address Registers (Offset 10h-14h)", + ) + base_address_2: PcieRegister = PcieRegister( + offset=0x18, + width=32, + desc="7.5.1.2.1 Base Address Registers (Offset 10h - 24h)", + ) + base_address_3: PcieRegister = PcieRegister( + offset=0x1C, + width=32, + desc="7.5.1.2.1 Base Address Registers (Offset 10h - 24h)", + ) + base_address_4: PcieRegister = PcieRegister( + offset=0x20, + width=32, + desc="7.5.1.2.1 Base Address Registers (Offset 10h - 24h)", + ) + base_address_5: PcieRegister = PcieRegister( + offset=0x24, + width=32, + desc="7.5.1.2.1 Base Address Registers (Offset 10h - 24h)", + ) + cardbus_cis: PcieRegister = PcieRegister( + offset=0x28, + width=32, + desc="7.5.1.2.2 Cardbus CIS Pointer 
class SecStatusRegister(PcieRegister):
    """Secondary Status register for Type 1 (16 bits at offset 0x1E).

    Masks are plain hex literals; the comment above each field gives the bit
    position(s) the mask selects. Bits 0-4 and 6 have no field declared.
    """

    offset: int = 0x1E
    width: int = 16
    desc: str = "Secondary Status Register"
    # bit 5
    sixty_six_mhz_cap: PcieBitField = PcieBitField(bit_mask=0x0020, desc="66 MHz Capable")
    # bit 7
    fast_b2b_trans_cap: PcieBitField = PcieBitField(
        bit_mask=0x0080, desc="Fast Back-to-Back Transactions Capable"
    )
    # bit 8
    mstr_data_par_err: PcieBitField = PcieBitField(
        bit_mask=0x0100, desc="Master Data Parity Error"
    )
    # bits 10:9
    devsel_timing: PcieBitField = PcieBitField(bit_mask=0x0600, desc="DEVSEL Timing")
    # bit 11
    signaled_target_abort: PcieBitField = PcieBitField(
        bit_mask=0x0800, desc="Signaled Target Abort"
    )
    # bit 12
    rcvd_target_abort: PcieBitField = PcieBitField(bit_mask=0x1000, desc="Received Target Abort")
    # bit 13
    rcvd_mstr_abort: PcieBitField = PcieBitField(bit_mask=0x2000, desc="Received Master Abort")
    # bit 14
    rcvd_sys_err: PcieBitField = PcieBitField(bit_mask=0x4000, desc="Received System Error")
    # bit 15
    det_parity_err: PcieBitField = PcieBitField(bit_mask=0x8000, desc="Detected Parity Error")
desc: str = "7.5.1.3.13 Bridge Control Register (Offset 3Eh)" + parity_err_res_en: PcieBitField = PcieBitField( + bit_mask=(1 << 0), desc="Parity Error Response Enable" + ) + serr_en: PcieBitField = PcieBitField(bit_mask=(1 << 1), desc="SERR# Enable") + isa_en: PcieBitField = PcieBitField(bit_mask=(1 << 2), desc="ISA Enable") + vga_en: PcieBitField = PcieBitField(bit_mask=(1 << 3), desc="VGA Enable") + vga_16_bit_dec: PcieBitField = PcieBitField(bit_mask=(1 << 4), desc="VGA 16-bit Decode") + mstr_abort_mode: PcieBitField = PcieBitField(bit_mask=(1 << 5), desc="Master Abort Mode") + sec_bus_rst: PcieBitField = PcieBitField(bit_mask=(1 << 6), desc="Secondary Bus Reset") + fast_b2b_trans_en: PcieBitField = PcieBitField( + bit_mask=(1 << 7), desc="Fast Back-to-Back Transactions Enable" + ) + primary_discard_timer: PcieBitField = PcieBitField( + bit_mask=(1 << 8), desc="Primary Discard Timer" + ) + sec_discard_timer: PcieBitField = PcieBitField( + bit_mask=(1 << 9), desc="Secondary Discard Timer" + ) + discard_timer_stat: PcieBitField = PcieBitField(bit_mask=(1 << 10), desc="Discard Timer Status") + discard_timer_serr_en: PcieBitField = PcieBitField( + bit_mask=(1 << 11), desc="Discard Timer SERR# Enable" + ) + + +class Type1Configuration(Type01Common): + """Type 1 Specific Common Configuration Space""" + + cap_id: ClassVar[Enum] = CapabilityEnum.BASE_REGISTER + desc: str = "Type 1 Specific Common Configuration Space" + PRIMARY_BUS: PcieRegister = PcieRegister( + offset=0x18, width=8, desc="7.5.1.3.2 Primary Bus Number Register (Offset 18h)" + ) + SECONDARY_BUS: PcieRegister = PcieRegister( + offset=0x19, + width=8, + desc="7.5.1.3.3 Secondary Bus Number Register (Offset 19h)", + ) + SUBORDINATE_BUS: PcieRegister = PcieRegister( + offset=0x1A, + width=8, + desc="7.5.1.3.4 Subordinate Bus Number Register (Offset 1Ah)", + ) + SEC_LATENCY_TIMER: PcieRegister = PcieRegister( + offset=0x1B, width=8, desc="7.5.1.3.5 Secondary Latency Timer (Offset 1Bh)" + ) + IO_BASE: 
PcieRegister = PcieRegister( + offset=0x1C, + width=8, + desc="7.5.1.3.6 I/O Base/I/O Limit Registers(Offset 1Ch/1Dh)", + ) + IO_LIMIT: PcieRegister = PcieRegister( + offset=0x1D, + width=8, + desc="7.5.1.3.6 I/O Base/I/O Limit Registers(Offset 1Ch/1Dh)", + ) + MEMORY_BASE: PcieRegister = PcieRegister( + offset=0x20, + width=16, + desc="7.5.1.3.8 Memory Base Register/Memory Limit Register(Offset 20h/22h)", + ) + MEMORY_LIMIT: PcieRegister = PcieRegister( + offset=0x22, + width=16, + desc="7.5.1.3.8 Memory Base Register/Memory Limit Register(Offset 20h/22h)", + ) + PREF_MEMORY_BASE: PcieRegister = PcieRegister( + offset=0x24, + width=16, + desc="7.5.1.3.9 Prefetchable Memory Base/Prefetchable Memory Limit Registers (Offset 24h/26h)", + ) + PREF_MEMORY_LIMIT: PcieRegister = PcieRegister( + offset=0x26, + width=16, + desc="7.5.1.3.9 Prefetchable Memory Base/Prefetchable Memory Limit Registers (Offset 24h/26h)", + ) + PREF_BASE_UPPER32: PcieRegister = PcieRegister( + offset=0x28, + width=32, + desc="7.5.1.3.10 Prefetchable Base Upper 32 Bits/Prefetchable Limit Upper 32 Bits Registers (Offset 28h/2Ch)", + ) + PREF_LIMIT_UPPER32: PcieRegister = PcieRegister( + offset=0x2C, + width=32, + desc="7.5.1.3.10 Prefetchable Base Upper 32 Bits/Prefetchable Limit Upper 32 Bits Registers (Offset 28h/2Ch)", + ) + IO_BASE_UPPER16: PcieRegister = PcieRegister( + offset=0x30, + width=16, + desc="7.5.1.3.11 I/O Base Upper 16 Bits/I/O Limit Upper 16 Bits Registers (Offset 30h/32h)", + ) + IO_LIMIT_UPPER16: PcieRegister = PcieRegister( + offset=0x32, + width=16, + desc="7.5.1.3.11 I/O Base Upper 16 Bits/I/O Limit Upper 16 Bits Registers (Offset 30h/32h)", + ) + BRIDGE_ROM_ADDRESS: PcieRegister = PcieRegister( + offset=0x38, + width=32, + desc="7.5.1.3.12 Expansion ROM Base Address Register (Offset 38h)", + ) + + +class CapPm(PcieCapStructure): + """Capability Structure for Power Management""" + + cap_id: ClassVar[Enum] = CapabilityEnum.PM + desc: str = "PCI Power Management Interface (9.6 
SR-IOV Power Management)" + + +class CapAgp(PcieCapStructure): + """Capability Structure for Accelerated Graphics Port""" + + cap_id: ClassVar[Enum] = CapabilityEnum.AGP + desc: str = "" + + +class CapVpd(PcieCapStructure): + """Capability Structure for Virtual Product Data""" + + cap_id: ClassVar[Enum] = CapabilityEnum.VPD + desc: str = "VPD (9.3.6.1 VPD Capability)" + + +class CapSlotid(PcieCapStructure): + """Capability Structure for Slot Identification""" + + cap_id: ClassVar[Enum] = CapabilityEnum.SLOTID + desc: str = "Slot Identification" + + +class CapMsi(PcieCapStructure): + """Capability Structure for Message Signaled Interrupts""" + + cap_id: ClassVar[Enum] = CapabilityEnum.MSI + desc: str = "7.7.1 MSI Capability Structures" + + +class CapCompatHotSwp(PcieCapStructure): + """Cap for CompactPCI Hot Swap""" + + cap_id: ClassVar[Enum] = CapabilityEnum.COMPACT_PCI_HS + desc: str = "CompactPCI Hot Swap" + + +class CapPcix(PcieCapStructure): + """Cap for PCI Extensions""" + + cap_id: ClassVar[Enum] = CapabilityEnum.PCIX + desc: str = "PCI-X" + + +class CapHt(PcieCapStructure): + """HyperTransport Capability""" + + cap_id: ClassVar[Enum] = CapabilityEnum.HYPERTRANS + desc: str = "HyperTransport" + + +class CapVndr(PcieCapStructure): + """Vendor Specific Capability""" + + cap_id: ClassVar[Enum] = CapabilityEnum.VENDOR + desc: str = "7.9.4 Vendor-Specific Capability" + + +class CapDbg(PcieCapStructure): + """Capability for Debug Port""" + + cap_id: ClassVar[Enum] = CapabilityEnum.DEBUG_PORT + desc: str = "Debug Port" + + +class CapCompatPcieCentral(PcieCapStructure): + """Capability for CompactPCI Central Resource Control""" + + cap_id: ClassVar[Enum] = CapabilityEnum.COMPACT_PCI_CENTRAL + desc: str = "CompactPCI Central Resource Control" + + +class CapHotplug(PcieCapStructure): + """Capability for PCI Hot Plug""" + + cap_id: ClassVar[Enum] = CapabilityEnum.PCI_HP + desc: str = "PCI Hot Plug" + + +class CapPciBridge(PcieCapStructure): + """Capability for PCI 
class CapEnhAgp(PcieCapStructure):
    """Capability structure for the AGP 8x capability (``CapabilityEnum.AGP_8X``).

    Enhanced Accelerated Graphics Port (AGP) interface supporting 8x data rate.
    """

    # Must be AGP_8X rather than AGP. CapabilityEnum.AGP already belongs to
    # CapAgp, so sharing it left AGP_8X with no class for cap_id_to_class to
    # return, and made this class unreachable behind CapAgp.
    cap_id: ClassVar[Enum] = CapabilityEnum.AGP_8X
    desc: str = "AGP 8x"
class DevStatRegister(PcieRegister):
    """Device Status register bit fields (16 bits at offset 0x0A).

    Masks are plain hex literals; the comment above each field gives the bit
    the mask selects. Bits 7-15 have no field declared.
    """

    offset: int = 0x0A
    width: int = 16
    desc: str = "Device Status Register"
    # bit 0
    corr_err_det: PcieBitField = PcieBitField(bit_mask=0x0001, desc="Correctable Error Detected")
    # bit 1
    non_fatal_err_det: PcieBitField = PcieBitField(
        bit_mask=0x0002, desc="Non-Fatal Error Detected"
    )
    # bit 2
    fatal_err_det: PcieBitField = PcieBitField(bit_mask=0x0004, desc="Fatal Error Detected")
    # bit 3
    ur_det: PcieBitField = PcieBitField(bit_mask=0x0008, desc="Unsupported Request Detected")
    # bit 4
    aux_pwr_det: PcieBitField = PcieBitField(bit_mask=0x0010, desc="AUX Power Detected")
    # bit 5
    trans_pending: PcieBitField = PcieBitField(bit_mask=0x0020, desc="Transactions Pending")
    # bit 6
    emer_pwr_reduction_det: PcieBitField = PcieBitField(
        bit_mask=0x0040, desc="Emergency Power Reduction Detected"
    )
class LinkStatRegister(PcieRegister):
    """Link Status register bit fields for Type 1 (16 bits at offset 0x12).

    Masks are plain hex literals; the comment above each field gives the bit
    position(s) the mask selects. Bit 10 has no field declared.

    NOTE(review): the "Type 1" qualifier is carried over from the original
    docstring; PcieExp embeds this register as ``lnk_stat_reg`` without such a
    restriction -- confirm whether the qualifier is intended.
    """

    offset: int = 0x12
    width: int = 16
    desc: str = "Link Status Register"
    # bits 3:0
    curr_lnk_speed: PcieBitField = PcieBitField(bit_mask=0x000F, desc="Current Link Speed")
    # bits 9:4
    neg_lnk_width: PcieBitField = PcieBitField(bit_mask=0x03F0, desc="Negotiated Link Width")
    # bit 11
    lnk_training: PcieBitField = PcieBitField(bit_mask=0x0800, desc="Link Training")
    # bit 12
    slot_clk_cfg: PcieBitField = PcieBitField(bit_mask=0x1000, desc="Slot Clock Configuration")
    # bit 13
    dll_lnk_active: PcieBitField = PcieBitField(
        bit_mask=0x2000, desc="Data Link Layer Link Active"
    )
    # bit 14
    lnk_bw_mgmt_stat: PcieBitField = PcieBitField(
        bit_mask=0x4000, desc="Link Bandwidth Management Status"
    )
    # bit 15
    lnk_auto_bw_stat: PcieBitField = PcieBitField(
        bit_mask=0x8000, desc="Link Autonomous Bandwidth Status"
    )
class LinkCap2Register(PcieRegister):
    """Link Capabilities 2 register bit fields for Type 1 (32 bits at offset 0x2C).

    Masks are plain hex literals; the comment above each field gives the bit
    position(s) the mask selects. Bits 0, 7, 15, 22 and 25-30 have no field
    declared.

    NOTE(review): each of the three speeds-vector masks is 6 bits wide, which
    leaves bits 7, 15 and 22 uncovered -- confirm the intended vector width
    against the register definition cited in ``desc``.
    """

    offset: int = 0x2C
    width: int = 32
    desc: str = "7.5.3.18 Link Capabilities 2 Register (Offset 2Ch)"
    # bits 6:1
    supported_lnk_speed_vec: PcieBitField = PcieBitField(
        bit_mask=0x0000007E, desc="Supported Link Speeds Vector"
    )
    # bit 8
    xlnk_supported: PcieBitField = PcieBitField(bit_mask=0x00000100, desc="Crosslink Supported")
    # bits 14:9
    lower_skp_os_gen_supported_speeds_vec: PcieBitField = PcieBitField(
        bit_mask=0x00007E00, desc="Lower SKP OS Generation Supported Speeds Vector"
    )
    # bits 21:16
    lower_skip_os_rec_supported_speeds_vec: PcieBitField = PcieBitField(
        bit_mask=0x003F0000, desc="Lower SKP OS Reception Supported Speeds Vector"
    )
    # bit 23
    retimer_prsnc_det_supported: PcieBitField = PcieBitField(
        bit_mask=0x00800000, desc="Retimer Presence Detect Supported"
    )
    # bit 24
    two_retimers_prsnc_det_supported: PcieBitField = PcieBitField(
        bit_mask=0x01000000, desc="Two Retimers Presence Detect Supported"
    )
    # bit 31
    drs_supported: PcieBitField = PcieBitField(bit_mask=0x80000000, desc="DRS Supported")
+ """ + + cap_id: ClassVar[Enum] = CapabilityEnum.PCIE_EXP + desc: str = "7.5.3 PCI Express Capability Structure" + cap_list: PcieCapListReg = PcieCapListReg() + pcie_cap_reg: PcieRegister = PcieRegister( + offset=2, + width=16, + desc="7.5.3.2 PCI Express Capabilities Register (Offset 02h)", + ) + dev_cap_reg: PcieRegister = PcieRegister( + offset=0x4, width=32, desc="7.5.3.3 Device Capabilities Register (Offset 04h)" + ) + dev_ctrl_reg: DevCtrlRegister = DevCtrlRegister() + dev_stat_reg: DevStatRegister = DevStatRegister() + lnk_cap_reg: LinkCapRegister = LinkCapRegister() + lnk_ctrl_reg: PcieRegister = PcieRegister( + offset=0x10, width=16, desc="7.5.3.7 Link Control Register (Offset 10h)" + ) + lnk_stat_reg: LinkStatRegister = LinkStatRegister() + dev_ctrl_2_reg: DevCtrl2Register = DevCtrl2Register() + lnk_cap_2_reg: LinkCap2Register = LinkCap2Register() + + +class CapMSIX(PcieCapStructure): + """Capability Structure for MSI-X""" + + cap_id: ClassVar[Enum] = CapabilityEnum.MSIX + offset: int = 0x00 + desc: str = "7.7.2 MSI-X Capability and Table Structure" + + +class CapSATA(PcieCapStructure): + """Cap for Serial ATA Data/Index Configuration""" + + cap_id: ClassVar[Enum] = CapabilityEnum.SATA + offset: int = 0x00 + desc: str = "Serial ATA Data/Index Configuration" + + +class CapAF(PcieCapStructure): + """Capability for Advanced Features""" + + cap_id: ClassVar[Enum] = CapabilityEnum.AF + offset: int = 0x00 + desc: str = "7.9.22 Conventional PCI Advanced Features Capability (AF)" + + +class CapEA(PcieCapStructure): + """Capability for Enhanced Allocation""" + + cap_id: ClassVar[Enum] = CapabilityEnum.EA + offset: int = 0x00 + desc: str = "7.8.5 Enhanced Allocation Capability Structure (EA)" + + +class AerEcapHdr(PcieRegister): + """Capability for Advanced Error Reporting""" + + offset: int = 0x00 + width: int = 32 + desc: str = "7.8.4.1 Advanced Error Reporting Extended Capability Header (Offset 00h)" + pcie_eacp_id: PcieBitField = PcieBitField( + 
bit_mask=0x0000FFFF, desc="PCI Express Extended Capability ID" + ) + cap_ver: PcieBitField = PcieBitField(bit_mask=0x000F0000, desc="Capability Version") + nxt_cap_offset: PcieBitField = PcieBitField(bit_mask=0xFFF00000, desc="Next Capability Offset") + + +class UncorrErrStatReg(PcieRegister): + """AER register for Uncorrectable Error Status Register""" + + offset: int = 0x04 + width: int = 32 + desc: str = "Uncorrectable Error Status Register" + dlnk_protocol_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 4), desc="Data Link Protocol Error Status" + ) + surprise_dn_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 5), desc="Surprise Down Error Status" + ) + poisoned_tlp_rcvd: PcieBitField = PcieBitField(bit_mask=(1 << 12), desc="Poisoned TLP Received") + fc_proto_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 13), desc="Flow Control Protocol Error Status" + ) + cpl_timeout_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 14), desc="Completion Timeout Status" + ) + ca_stat: PcieBitField = PcieBitField(bit_mask=(1 << 15), desc="Completer Abort Status") + unexp_cpl_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 16), desc="Unexpected Completion Status" + ) + rx_overflow_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 17), desc="Receiver Overflow Status" + ) + malformed_tlp_stat: PcieBitField = PcieBitField(bit_mask=(1 << 18), desc="Malformed TLP Status") + ecrc_err_stat: PcieBitField = PcieBitField(bit_mask=(1 << 19), desc="ECRC Error Status") + ur_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 20), desc="Unsupported Request Error Status" + ) + acs_violation_stat: PcieBitField = PcieBitField(bit_mask=(1 << 21), desc="ACS Violation Status") + uncorr_int_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 22), desc="Uncorrectable Internal Error Status" + ) + mc_blocked_tlp_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 23), desc="MC Blocked TLP Status" + ) + atomicop_egress_blk_stat: PcieBitField = PcieBitField( + 
bit_mask=(1 << 24), desc="AtomicOp Egress Blocked Status" + ) + tlp_prefix_blk_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 25), desc="TLP Prefix Blocked Error Status" + ) + poisoned_tlp_egress_blk_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 26), desc="Poisoned TLP Egress Blocked Status" + ) + + +class UncorrErrMaskReg(PcieRegister): + """AER register for Uncorrectable Error Mask Register""" + + offset: int = 0x08 + width: int = 32 + desc: str = "7.8.4.3 Uncorrectable Error Mask Register (Offset 08h)" + dlnk_protocol_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 4), desc="Data Link Protocol Error Mask" + ) + surprise_dn_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 5), desc="Surprise Down Error Mask" + ) + poisoned_tlp_rcvd_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 12), desc="Poisoned TLP Received Mask" + ) + fc_proto_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 13), desc="Flow Control Protocol Error Mask" + ) + cpl_timeout_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 14), desc="Completion Timeout Mask" + ) + ca_mask: PcieBitField = PcieBitField(bit_mask=(1 << 15), desc="Completer Abort Mask") + unexp_cpl_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 16), desc="Unexpected Completion Mask" + ) + rx_overflow_mask: PcieBitField = PcieBitField(bit_mask=(1 << 17), desc="Receiver Overflow Mask") + malformed_tlp_mask: PcieBitField = PcieBitField(bit_mask=(1 << 18), desc="Malformed TLP Mask") + ecrc_err_mask: PcieBitField = PcieBitField(bit_mask=(1 << 19), desc="ECRC Error Mask") + ur_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 20), desc="Unsupported Request Error Mask" + ) + acs_violation_mask: PcieBitField = PcieBitField(bit_mask=(1 << 21), desc="ACS Violation Mask") + uncorr_int_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 22), desc="Uncorrectable Internal Error Mask" + ) + mc_blocked_tlp_mask: PcieBitField = PcieBitField(bit_mask=(1 << 23), desc="MC Blocked TLP Mask") + 
atomicop_egress_blk_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 24), desc="AtomicOp Egress Blocked Mask" + ) + tlp_prefix_blk_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 25), desc="TLP Prefix Blocked Error Mask" + ) + poisoned_tlp_egress_blk_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 26), desc="Poisoned TLP Egress Blocked Mask" + ) + + +class UncorrErrSevReg(PcieRegister): + """AER register for Uncorrectable Error Severity Register""" + + offset: int = 0x0C + width: int = 32 + desc: str = "7.8.4.4 Uncorrectable Error Severity Register (Offset 0Ch)" + dlnk_protocol_err_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 4), desc="Data Link Protocol Error Severity" + ) + surprise_dn_err_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 5), desc="Surprise Down Error Severity" + ) + poisoned_tlp_rcvd_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 12), desc="Poisoned TLP Received Severity" + ) + fc_proto_err_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 13), desc="Flow Control Protocol Error Severity" + ) + cpl_timeout_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 14), desc="Completion Timeout Error Severity" + ) + ca_sev: PcieBitField = PcieBitField(bit_mask=(1 << 15), desc="Completer Abort Error Severity") + unexp_cpl_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 16), desc="Unexpected Completion Error Severity" + ) + rx_overflow_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 17), desc="Receiver Overflow Severity" + ) + malformed_tlp_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 18), desc="Malformed TLP Severity" + ) + ecrc_err_sev: PcieBitField = PcieBitField(bit_mask=(1 << 19), desc="ECRC Error Severity") + ur_err_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 20), desc="Unsupported Request Error Severity" + ) + acs_violation_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 21), desc="ACS Violation Severity" + ) + uncorr_int_err_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 22), desc="Uncorrectable 
Internal Error Severity" + ) + mc_blocked_tlp_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 23), desc="MC Blocked TLP Severity" + ) + atomicop_egress_blk_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 24), desc="AtomicOp Egress Blocked Severity" + ) + tlp_prefix_blk_err_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 25), desc="TLP Prefix Blocked Error Severity" + ) + poisoned_tlp_egress_blk_sev: PcieBitField = PcieBitField( + bit_mask=(1 << 26), desc="Poisoned TLP Egress Blocked Severity" + ) + + +class CorrErrStatReg(PcieRegister): + """AER register for Correctable Error Status Register""" + + offset: int = 0x10 + width: int = 32 + desc: str = "Correctable Error Status Register" + rx_err_stat: PcieBitField = PcieBitField(bit_mask=(1 << 0), desc="Receiver Error Status") + bad_tlp_stat: PcieBitField = PcieBitField(bit_mask=(1 << 6), desc="Bad TLP Status") + bad_dllp_stat: PcieBitField = PcieBitField(bit_mask=(1 << 7), desc="Bad DLLP Status") + replay_num_rollover_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 8), desc="REPLAY_NUM Rollover Status" + ) + replay_timer_timeout_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 12), desc="Replay Timer Timeout Status" + ) + advisory_non_fatal_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 13), desc="Advisory Non-Fatal Error Status" + ) + corrected_int_err_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 14), desc="Corrected Internal Error Status" + ) + hdr_log_overflow_stat: PcieBitField = PcieBitField( + bit_mask=(1 << 15), desc="Header Log Overflow Status" + ) + + +class CorrErrMaskReg(PcieRegister): + """AER register for Correctable Error Mask Register""" + + offset: int = 0x14 + width: int = 32 + desc: str = "7.8.4.6 Correctable Error Mask Register (Offset 14h)" + rx_err_mask: PcieBitField = PcieBitField(bit_mask=(1 << 0), desc="Receiver Error Mask") + bad_tlp_mask: PcieBitField = PcieBitField(bit_mask=(1 << 6), desc="Bad TLP Mask") + bad_dllp_mask: PcieBitField = 
PcieBitField(bit_mask=(1 << 7), desc="Bad DLLP Mask") + replay_num_rollover_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 8), desc="REPLAY_NUM Rollover Mask" + ) + replay_timer_timeout_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 12), desc="Replay Timer Timeout Mask" + ) + advisory_non_fatal_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 13), desc="Advisory Non-Fatal Error Mask" + ) + corrected_int_err_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 14), desc="Corrected Internal Error Mask" + ) + hdr_log_overflow_mask: PcieBitField = PcieBitField( + bit_mask=(1 << 15), desc="Header Log Overflow Mask" + ) + + +class AerCapCtrlReg(PcieRegister): + """AER register for Advanced Error Capabilities and Control Register""" + + offset: int = 0x18 + width: int = 32 + desc: str = "7.8.4.7 Advanced Error Capabilities and Control Register (Offset 18h)" + fst_err_ptr: PcieBitField = PcieBitField(bit_mask=(0x1F), desc="First Error Pointer") + ecrc_gen_cap: PcieBitField = PcieBitField(bit_mask=(1 << 5), desc="ECRC Generation Capable") + ecrc_gen_en: PcieBitField = PcieBitField(bit_mask=(1 << 6), desc="ECRC Generation Enable") + ecrc_chk_cap: PcieBitField = PcieBitField(bit_mask=(1 << 7), desc="ECRC Check Capable") + ecrc_chk_en: PcieBitField = PcieBitField(bit_mask=(1 << 8), desc="ECRC Check Enable") + multi_hdr_rec_cap: PcieBitField = PcieBitField( + bit_mask=(1 << 9), desc="Multiple Header Recording Capable" + ) + multi_hdr_rec_en: PcieBitField = PcieBitField( + bit_mask=(1 << 10), desc="Multiple Header Recording Enable" + ) + tlp_prefix_log_prsnt: PcieBitField = PcieBitField( + bit_mask=(1 << 11), desc="TLP Prefix Log Present" + ) + cpl_timeout_prefix_hdr_log_cap: PcieBitField = PcieBitField( + bit_mask=(1 << 12), desc="Completion Timeout Prefix/Header Log Capable" + ) + + +class RootErrCmdReg(PcieRegister): + """AER register for Root Error Command Register""" + + offset: int = 0x2C + width: int = 32 + desc: str = "7.8.4.9 Root Error Command Register 
(Offset 2Ch)" + corr_err_report_en: PcieBitField = PcieBitField( + bit_mask=(1 << 0), desc="Correctable Error Reporting Enable" + ) + non_fatal_err_report_en: PcieBitField = PcieBitField( + bit_mask=(1 << 1), desc="Non-Fatal Error Reporting Enable" + ) + fatal_err_report_en: PcieBitField = PcieBitField( + bit_mask=(1 << 2), desc="Fatal Error Reporting Enable" + ) + + +class RootErrStatReg(PcieRegister): + """AER register for Root Error Status Register""" + + offset: int = 0x30 + width: int = 32 + desc: str = "Root Error Status Register" + err_cor_rcvd: PcieBitField = PcieBitField(bit_mask=(1 << 0), desc="ERR_COR Received") + multi_err_cor_rcvd: PcieBitField = PcieBitField( + bit_mask=(1 << 1), desc="Multiple ERR_COR Received" + ) + err_fatal_nonfatal_rcvd: PcieBitField = PcieBitField( + bit_mask=(1 << 2), desc="ERR_FATAL/NONFATAL Received" + ) + multi_err_fatal_nonfatal_rcvd: PcieBitField = PcieBitField( + bit_mask=(1 << 3), desc="Multiple ERR_FATAL/NONFATAL Received" + ) + fst_uncorr_fatal: PcieBitField = PcieBitField( + bit_mask=(1 << 4), desc="First Uncorrectable Fatal" + ) + non_fatal_err_msg_rcvd: PcieBitField = PcieBitField( + bit_mask=(1 << 5), desc="Non-Fatal Error Messages Received" + ) + fatal_err_msg_rcvd: PcieBitField = PcieBitField( + bit_mask=(1 << 6), desc="Fatal Error Messages Received" + ) + err_cor_subclass: PcieBitField = PcieBitField(bit_mask=(0x3 << 7), desc="ERR_COR Subclass") + adv_err_int_msg_num: PcieBitField = PcieBitField( + bit_mask=(0x1F << 27), desc="Advanced Error Interrupt Message Number" + ) + + +class ErrSrcIdReg(PcieRegister): + """AER register for Error Source Identification Register""" + + offset: int = 0x34 + width: int = 32 + desc: str = "7.8.4.11 Error Source Identification Register (Offset 34h)" + err_cor_src_id: PcieBitField = PcieBitField( + bit_mask=0x0000FFFF, desc="ERR_COR Source Identification" + ) + err_fatal_nonfatal_src_id: PcieBitField = PcieBitField( + bit_mask=0xFFFF0000, desc="ERR_FATAL/NONFATAL Source 
Identification" + ) + + +class ECapAer(PcieCapStructure): + """Extended Capability for Advanced Error Reporting""" + + extended: Optional[bool] = True + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.AER + offset: int = 0x00 + desc: str = "7.8.4 Advanced Error Reporting Extended Capability" + aer_ecap: AerEcapHdr = AerEcapHdr() + uncorr_err_stat: UncorrErrStatReg = UncorrErrStatReg() + uncorr_err_mask: UncorrErrMaskReg = UncorrErrMaskReg() + uncorr_err_sev: UncorrErrSevReg = UncorrErrSevReg() + corr_err_stat: CorrErrStatReg = CorrErrStatReg() + corr_err_mask: CorrErrMaskReg = CorrErrMaskReg() + aer_cap_ctrl: AerCapCtrlReg = AerCapCtrlReg() + root_err_cmd: RootErrCmdReg = RootErrCmdReg() + root_err_stat: RootErrStatReg = RootErrStatReg() + err_src_id: ErrSrcIdReg = ErrSrcIdReg() + + +class ECapVc(PcieCapStructure): + """Extended Capability for Virtual Channel""" + + extended: Optional[bool] = True + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.VCEC + offset: int = 0x00 + desc: str = "7.9.1 Virtual Channel Extended Capability" + + +class ECapDsn(PcieCapStructure): + """Extended Capability for Device Serial Number""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.DSN + offset: int = 0x00 + desc: str = "7.9.3 Device Serial Number Extended Capability" + + +class ECapPb(PcieCapStructure): + """Extended Capability for Power Budgeting""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.PWR_BUDGET + offset: int = 0x00 + desc: str = "7.8.1 Power Budgeting Extended Capability" + + +class ECapRclink(PcieCapStructure): + """Extended Capability for Root Complex Link Declaration""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.LNK_DCLR + offset: int = 0x00 + desc: str = "7.9.8.1 Root Complex Link Declaration Extended Capability Header (Offset 00h)" + + +class ECapRcilink(PcieCapStructure): + """Extended Capability for Root Complex Internal Link Control""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.LNK_CEC + offset: int = 0x00 + desc: str = "7.9.9 Root 
Complex Internal Link Control Extended Capability" + + +class ECapRcecoll(PcieCapStructure): + """Extended Capability for Root Complex Event Collector Endpoint Association""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.RCECOLL + offset: int = 0x00 + desc: str = ( + "7.9.10 Root Complex Event Collector Endpoint Association Extended Capability (Dell)" + ) + + +class ECapMfvc(PcieCapStructure): + """Extended Capability for Multi-Function Virtual Channel""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.MFVC + offset: int = 0x00 + desc: str = "7.9.2 Multi-Function Virtual Channel Extended Capability" + + +class ECapVc2(PcieCapStructure): + """Extended Capability for Virtual Channel 2""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.VC2 + offset: int = 0x00 + desc: str = "7.9.1 Virtual Channel Extended Capability" + + +class ECapRcrb(PcieCapStructure): + """Extended Capability for RCRB Header""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.RCRB + offset: int = 0x00 + desc: str = "7.9.7 RCRB Header Extended Capability" + + +class ECapVndr(PcieCapStructure): + """Extended Capability for Vendor-Specific""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.VNDR + offset: int = 0x00 + desc: str = "7.9.5 Vendor-Specific Extended Capability" + + +class ECapCac(PcieCapStructure): + """Extended Capability for Configuration Access Correlation""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.CAC + offset: int = 0x00 + desc: str = "7.7. 
Configuration Access Correlation Extended Capability" + + +class ECapAcs(PcieCapStructure): + """Extended Capability for ACS""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.ACS + offset: int = 0x00 + desc: str = "7.7.8 ACS Extended Capability" + + +class ECapAri(PcieCapStructure): + """Extended Capability for ARI""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.ARI + offset: int = 0x00 + desc: str = "7.8.7 ARI Extended Capability" + + +class ECapAts(PcieCapStructure): + """Extended Capability for ATS""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.ATS + offset: int = 0x00 + desc: str = "10.5.1 ATS Extended Capability" + + +class ECapSriov(PcieCapStructure): + """Extended Capability for SR-IOV""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.SRIOV + offset: int = 0x00 + desc: str = "9.3.3 SR-IOV Extended Capability" + + +class ECapMriov(PcieCapStructure): + """Extended Capability for MR-IOV""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.MRIOV + offset: int = 0x00 + desc: str = "MR-IOV Extended Capability (MR-IOV)" + + +class ECapMcast(PcieCapStructure): + """Extended Capability for Multicast""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.MULTCAST + offset: int = 0x00 + desc: str = "7.9.11 Multicast Extended Capability" + + +class ECapPri(PcieCapStructure): + """Extended Capability for Page Request Interface""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.PAGE_REQ + offset: int = 0x00 + desc: str = "10.5.2 Page Request Extended Capability Structure" + + +class ECapAMD(PcieCapStructure): + """Extended Capability for AMD""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.AMD + offset: int = 0x00 + desc: str = "Reserved for AMD" + + +class ECapReba(PcieCapStructure): + """Extended Capability for Resizable BAR""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.RBAR + offset: int = 0x00 + desc: str = "7.8.6 Resizable BAR Extended Capability" + + +class ECapDpa(PcieCapStructure): + """Extended Capability for Dynamic 
Power Allocation""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.DPA + offset: int = 0x00 + desc: str = "7.9.12 Dynamic Power Allocation Extended Capability (DPA Capability)" + + +class ECapTph(PcieCapStructure): + """Extended Capability for TPH""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.TPH + offset: int = 0x00 + desc: str = "7.9.13.1 TPH Requester Extended Capability Header (Offset 00h)" + + +class ECapLtr(PcieCapStructure): + """Extended Capability for LTR""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.LTR + offset: int = 0x00 + desc: str = "7.8.2 Latency Tolerance Reporting (LTR) Extended Capability" + + +class LaneErrorStatReg(PcieRegister): + """Lane error status register""" + + desc: str = "Lane Error Status Register" + offset: int = 0x08 + width: int = 32 + lane0_err_stat: PcieBitField = PcieBitField( + bit_mask=0xFFFFFFFF, + desc="Lane Error Status Bits - Each bit indicates if the corresponding Lane detected a Lane-based error.", + ) + + +class ECapSecpci(PcieCapStructure): + """Extended Capability for Secondary PCI Express""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.SPCI + offset: int = 0x00 + desc: str = "7.7.3 Secondary PCI Express Extended Capability" + lane_err_stat: LaneErrorStatReg = LaneErrorStatReg() + + +class ECapPmux(PcieCapStructure): + """Extended Capability for PMUX""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.PMUX + offset: int = 0x00 + desc: str = "G.5 PMUX Extended Capability" + + +class ECapPasid(PcieCapStructure): + """Extended Capability for PASID""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.PASID + offset: int = 0x00 + desc: str = "7.8.8 PASID Extended Capability Structure" + + +class ECapLnr(PcieCapStructure): + """Extended Capability for LN Requester""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.LN + offset: int = 0x00 + desc: str = "7.9.14 LN Requester Extended Capability (LNR Capability)" + + +class ECapDpc(PcieCapStructure): + """Extended Capability for DPC""" + + 
cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.DPC + offset: int = 0x00 + desc: str = "7.9.15 DPC Extended Capability" + + +class ECapL1pm(PcieCapStructure): + """Extended Capability for L1 PM Substates""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.L1PM + offset: int = 0x00 + desc: str = "7.8.3 L1 PM Substates Extended Capability" + + +class ECapPtm(PcieCapStructure): + """Extended Capability for PTM""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.PTM + offset: int = 0x00 + desc: str = "7.9.16 Precision Time Management Extended Capability (PTM Capability)" + + +class ECapMpcie(PcieCapStructure): + """Extended Capability for M-PCIe""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.MPCIE + offset: int = 0x00 + desc: str = "PCI Express over M-PHY Extended Capability (M-PCIe)" + + +class ECapFrs(PcieCapStructure): + """Extended Capability for FRS Queueing""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.FRS + offset: int = 0x00 + desc: str = "7.8.9 FRS Queueing Extended Capability" + + +class ECapRtr(PcieCapStructure): + """Extended Capability for Readiness Time Reporting""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.RTR + offset: int = 0x00 + desc: str = "7.9.17 Readiness Time Reporting Extended Capability" + + +class ECapDvsec(PcieCapStructure): + """Extended Capability for Designated Vendor-Specific""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.DVENDR + offset: int = 0x00 + desc: str = "7.9.6 Designated Vendor-Specific Extended Capability (DVSEC)" + + +class ECapVfRebar(PcieCapStructure): + """Extended Capability for VF Resizable BAR""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.VFBAR + offset: int = 0x00 + desc: str = "9.3.7.5 VF Resizable BAR Extended Capability" + + +class ECapDlnk(PcieCapStructure): + """Extended Capability for Downstream Link""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.DLF + offset: int = 0x00 + desc: str = "7.7.4 Data Link Feature Extended Capability" + + +class 
Phy16GtEcapHdr(PcieRegister): + """Extended Capability for 16.0 GT/s Physical Layer""" + + offset: int = 0x00 + width: int = 32 + desc: str = "7.7.5.1 Physical Layer 16.0 GT/s Extended Capability Header (Offset 00h)" + pcie_ecap_id: PcieBitField = PcieBitField( + bit_mask=0x0000FFFF, desc="PCI Express Extended Capability ID" + ) + cap_ver: PcieBitField = PcieBitField(bit_mask=0x000F0000, desc="Capability Version") + nxt_cap_offset: PcieBitField = PcieBitField(bit_mask=0xFFF00000, desc="Next Capability Offset") + + +class Phy16GtEcapStat(PcieRegister): + """Register for 16.0 GT/s Physical Layer Status""" + + offset: int = 0x0C + width: int = 32 + desc: str = "16.0 GT/s Status Register" + eq_16gt_cpl: PcieBitField = PcieBitField( + bit_mask=(1 << 0), desc="Equalization 16.0 GT/s Complete" + ) + eq_16gt_ph1_success: PcieBitField = PcieBitField( + bit_mask=(1 << 1), desc="Equalization 16.0 GT/s Phase 1 Successful" + ) + eq_16gt_ph2_success: PcieBitField = PcieBitField( + bit_mask=(1 << 2), desc="Equalization 16.0 GT/s Phase 2 Successful" + ) + eq_16gt_ph3_success: PcieBitField = PcieBitField( + bit_mask=(1 << 3), desc="Equalization 16.0 GT/s Phase 3 Successful" + ) + lnk_eq_req_16gt: PcieBitField = PcieBitField( + bit_mask=(1 << 4), desc="Link Equalization Request 16.0 GT/s" + ) + + +class ParityMisMatchStat16GT(PcieRegister): + """Register for 16.0 GT/s Parity Mismatch Status""" + + pos: int = 10 + width: int = 32 + offset: int = 0x10 + desc: str = "16.0 GT/s Local Data Parity Mismatch Status Register" + + +class RetimerFstPartiyRetimerMismatchStat16gt(PcieRegister): + """Register for 16.0 GT/s First Retimer Data Parity Mismatch Status""" + + pos: int = 14 + width: int = 32 + offset: int = 0x14 + desc: str = "16.0 GT/s First Retimer Data Parity Mismatch Status Register" + + +class RetimerSecPartiyRetimerMismatchStat16gt(PcieRegister): + """Register for 16.0 GT/s Second Retimer Data Parity Mismatch Status""" + + pos: int = 18 + width: int = 32 + offset: int = 0x18 +
desc: str = "16.0 GT/s Second Retimer Data Parity Mismatch Status Register" + + +class EqCtl16Gt0(PcieRegister): + """Register for 16.0 GT/s Equalization Control 0""" + + offset: int + width: int = 8 + desc: str = "7.7.5.9 16.0 GT/s Lane Equalization Control Register (Offsets 20h to 3Ch)" + upstream_eq_ctl_16gt_0: PcieBitField = PcieBitField( + bit_mask=0x000000FF, desc="Upstream Equalization Control 16.0 GT/s 0" + ) + downstream_eq_ctl_16gt_0: PcieBitField = PcieBitField( + bit_mask=0x0000FF00, desc="Downstream Equalization Control 16.0 GT/s 0" + ) + + +class ECap16Gt(PcieCapStructure): + """Extended Capability for 16.0 GT/s Physical Layer""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.PL_16GT + offset: int = 0x00 + desc: str = "7.7.5 Physical Layer 16.0 GT/s Extended Capability" + header: Phy16GtEcapHdr = Phy16GtEcapHdr() + status: Phy16GtEcapStat = Phy16GtEcapStat() + parity_mismatch_stat: ParityMisMatchStat16GT = ParityMisMatchStat16GT() + retimer_fst_parity_mismatch_stat: RetimerFstPartiyRetimerMismatchStat16gt = ( + RetimerFstPartiyRetimerMismatchStat16gt() + ) + retimer_sec_parity_mismatch_stat: RetimerSecPartiyRetimerMismatchStat16gt = ( + RetimerSecPartiyRetimerMismatchStat16gt() + ) + eq_ctl_16gt_0: EqCtl16Gt0 = EqCtl16Gt0(offset=0x20, desc="16GT/s Equalization Control 0") + eq_ctl_16gt_1: EqCtl16Gt0 = EqCtl16Gt0(offset=0x21, desc="16GT/s Equalization Control 1") + eq_ctl_16gt_2: EqCtl16Gt0 = EqCtl16Gt0(offset=0x22, desc="16GT/s Equalization Control 2") + eq_ctl_16gt_3: EqCtl16Gt0 = EqCtl16Gt0(offset=0x23, desc="16GT/s Equalization Control 3") + eq_ctl_16gt_4: EqCtl16Gt0 = EqCtl16Gt0(offset=0x24, desc="16GT/s Equalization Control 4") + eq_ctl_16gt_5: EqCtl16Gt0 = EqCtl16Gt0(offset=0x25, desc="16GT/s Equalization Control 5") + eq_ctl_16gt_6: EqCtl16Gt0 = EqCtl16Gt0(offset=0x26, desc="16GT/s Equalization Control 6") + eq_ctl_16gt_7: EqCtl16Gt0 = EqCtl16Gt0(offset=0x27, desc="16GT/s Equalization Control 7") + eq_ctl_16gt_8: EqCtl16Gt0 = 
EqCtl16Gt0(offset=0x28, desc="16GT/s Equalization Control 8") + eq_ctl_16gt_9: EqCtl16Gt0 = EqCtl16Gt0(offset=0x29, desc="16GT/s Equalization Control 9") + eq_ctl_16gt_10: EqCtl16Gt0 = EqCtl16Gt0(offset=0x2A, desc="16GT/s Equalization Control 10") + eq_ctl_16gt_11: EqCtl16Gt0 = EqCtl16Gt0(offset=0x2B, desc="16GT/s Equalization Control 11") + eq_ctl_16gt_12: EqCtl16Gt0 = EqCtl16Gt0(offset=0x2C, desc="16GT/s Equalization Control 12") + eq_ctl_16gt_13: EqCtl16Gt0 = EqCtl16Gt0(offset=0x2D, desc="16GT/s Equalization Control 13") + eq_ctl_16gt_14: EqCtl16Gt0 = EqCtl16Gt0(offset=0x2E, desc="16GT/s Equalization Control 14") + eq_ctl_16gt_15: EqCtl16Gt0 = EqCtl16Gt0(offset=0x2F, desc="16GT/s Equalization Control 15") + eq_ctl_16gt_16: EqCtl16Gt0 = EqCtl16Gt0(offset=0x30, desc="16GT/s Equalization Control 16") + eq_ctl_16gt_17: EqCtl16Gt0 = EqCtl16Gt0(offset=0x31, desc="16GT/s Equalization Control 17") + eq_ctl_16gt_18: EqCtl16Gt0 = EqCtl16Gt0(offset=0x32, desc="16GT/s Equalization Control 18") + eq_ctl_16gt_19: EqCtl16Gt0 = EqCtl16Gt0(offset=0x33, desc="16GT/s Equalization Control 19") + eq_ctl_16gt_20: EqCtl16Gt0 = EqCtl16Gt0(offset=0x34, desc="16GT/s Equalization Control 20") + eq_ctl_16gt_21: EqCtl16Gt0 = EqCtl16Gt0(offset=0x35, desc="16GT/s Equalization Control 21") + eq_ctl_16gt_22: EqCtl16Gt0 = EqCtl16Gt0(offset=0x36, desc="16GT/s Equalization Control 22") + eq_ctl_16gt_23: EqCtl16Gt0 = EqCtl16Gt0(offset=0x37, desc="16GT/s Equalization Control 23") + eq_ctl_16gt_24: EqCtl16Gt0 = EqCtl16Gt0(offset=0x38, desc="16GT/s Equalization Control 24") + eq_ctl_16gt_25: EqCtl16Gt0 = EqCtl16Gt0(offset=0x39, desc="16GT/s Equalization Control 25") + eq_ctl_16gt_26: EqCtl16Gt0 = EqCtl16Gt0(offset=0x3A, desc="16GT/s Equalization Control 26") + eq_ctl_16gt_27: EqCtl16Gt0 = EqCtl16Gt0(offset=0x3B, desc="16GT/s Equalization Control 27") + eq_ctl_16gt_28: EqCtl16Gt0 = EqCtl16Gt0(offset=0x3C, desc="16GT/s Equalization Control 28") + eq_ctl_16gt_29: EqCtl16Gt0 = EqCtl16Gt0(offset=0x3D, 
desc="16GT/s Equalization Control 29") + eq_ctl_16gt_30: EqCtl16Gt0 = EqCtl16Gt0(offset=0x3E, desc="16GT/s Equalization Control 30") + eq_ctl_16gt_31: EqCtl16Gt0 = EqCtl16Gt0(offset=0x3F, desc="16GT/s Equalization Control 31") + + +class ECapLmr(PcieCapStructure): + """Extended Capability for Lane Margining at the Receiver""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.LM + offset: int = 0x00 + desc: str = "7.7.7 Lane Margining at the Receiver Extended Capability" + + +class ECapHierId(PcieCapStructure): + """Extended Capability for Hierarchy ID""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.HID + offset: int = 0x00 + desc: str = "7.9.18 Hierarchy ID Extended Capability" + + +class ECapNpem(PcieCapStructure): + """Extended Capability for Native PCIe Enclosure Management""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.NPEM + offset: int = 0x00 + desc: str = ( + "7.9.20 Native PCIe Enclosure Management Extended Capability (NPEM Extended Capability)" + ) + + +class Phy32GtEcapHdr(PcieRegister): + """Extended Capability for 32.0 GT/s Physical Layer""" + + offset: int = 0x00 + width: int = 32 + desc: str = "7.7.6.1 Physical Layer 32.0 GT/s Extended Capability Header (Offset 00h)" + pcie_ecap_id: PcieBitField = PcieBitField( + bit_mask=0x0000FFFF, desc="PCI Express Extended Capability ID" + ) + cap_ver: PcieBitField = PcieBitField(bit_mask=0x000F0000, desc="Capability Version") + nxt_cap_offset: PcieBitField = PcieBitField(bit_mask=0xFFF00000, desc="Next Capability Offset") + + +class Phy32GtEcapCapReg(PcieRegister): + """Register for 32.0 GT/s Capabilities""" + + offset: int = 0x04 + width: int = 32 + desc: str = "7.7.6.2 32.0 GT/s Capabilities Register (Offset 04h" + eq_bypass_hi_rate: PcieBitField = PcieBitField( + bit_mask=(1 << 0), desc="Equalization bypass to highest rate Supported" + ) + no_equi_needed: PcieBitField = PcieBitField( + bit_mask=(1 << 1), desc="No Equalization Needed Supported - When Set" + ) + 
modified_ts_usage_mode_0_supported: PcieBitField = PcieBitField( + bit_mask=(1 << 8), desc="Modified TS Usage Mode 0 Supported" + ) + modified_ts_usage_mode_1_supported: PcieBitField = PcieBitField( + bit_mask=(1 << 9), desc="Modified TS Usage Mode 1 Supported" + ) + modified_ts_usage_mode_2_supported: PcieBitField = PcieBitField( + bit_mask=(1 << 10), desc="Modified TS Usage Mode 2 Supported" + ) + modified_ts_reserved_usage_modes: PcieBitField = PcieBitField( + bit_mask=(0x1F << 11), desc="Modified TS Reserved Usage Modes" + ) + + +class Phy32GtStatReg(PcieRegister): + """Register for 32.0 GT/s Status""" + + offset: int = 0x0C + width: int = 32 + desc: str = "32.0 GT/s Status Register" + eq_32gt_cpl: PcieBitField = PcieBitField( + bit_mask=(1 << 0), desc="Equalization 32.0 GT/s Complete" + ) + eq_32gt_ph1_success: PcieBitField = PcieBitField( + bit_mask=(1 << 1), desc="Equalization 32.0 GT/s Phase 1 Successful" + ) + eq_32gt_ph2_success: PcieBitField = PcieBitField( + bit_mask=(1 << 2), desc="Equalization 32.0 GT/s Phase 2 Successful" + ) + eq_32gt_ph3_success: PcieBitField = PcieBitField( + bit_mask=(1 << 3), desc="Equalization 32.0 GT/s Phase 3 Successful" + ) + lnk_eq_req_32gt: PcieBitField = PcieBitField( + bit_mask=(1 << 4), desc="Link Equalization Request 32.0 GT/s" + ) + modified_ts_rcvd: PcieBitField = PcieBitField(bit_mask=(1 << 5), desc="Modified TS Received") + rcvd_enhanced_link_behav_ctrl: PcieBitField = PcieBitField( + bit_mask=(0x3 << 6), desc="Received Enhanced Link Behavior Control" + ) + tx_precoding_on: PcieBitField = PcieBitField(bit_mask=(1 << 8), desc="Transmitter Precoding On") + tx_precoding_req: PcieBitField = PcieBitField( + bit_mask=(1 << 9), desc="Transmitter Precode Request" + ) + no_eq_needed_rcvd: PcieBitField = PcieBitField( + bit_mask=(1 << 10), desc="No Equalization Needed Received" + ) + + +class TransReceived32GTData1(PcieRegister): + """Register for 32.0 GT/s Received Modified TS Data 1""" + + offset: int = 0x10 + width: int = 
32 + desc: str = "7.7.6.5 Received Modified TS Data 1 Register (Offset 10h)" + rcvd_mod_ts_usage_mode: PcieBitField = PcieBitField( + bit_mask=(0x7 << 0), desc="Received Modified TS Usage Mode" + ) + rcvd_mod_ts_info_1: PcieBitField = PcieBitField( + bit_mask=(0xFFF << 3), desc="Received Modified TS Information 1" + ) + rcvd_mod_ts_vendor_id: PcieBitField = PcieBitField( + bit_mask=(0xFFFF << 16), desc="Received Modified TS Vendor ID" + ) + + +# 23:0 Received Modified TS Information 2 +# 25:24 Alternate Protocol Negotiation Status +class TransReceived32GTData2(PcieRegister): + """Register for 32.0 GT/s Received Modified TS Data 2""" + + offset: int = 0x14 + width: int = 32 + desc: str = "7.7.6.6 Received Modified TS Data 2 Register (Offset 14h)" + rcvd_mod_ts_info_2: PcieBitField = PcieBitField( + bit_mask=(0x7FF << 0), desc="Received Modified TS Information 2" + ) + alt_proto_neg_status: PcieBitField = PcieBitField( + bit_mask=(0x3 << 24), desc="Alternate Protocol Negotiation Status" + ) + + +class EqCtl32Gt0(PcieRegister): + """Equalization Control for 32.0 GT/s""" + + offset: int + width: int = 8 + desc: str = "7.7.6.9 32.0 GT/s Lane Equalization Control Register (Offset 20h to 3Ch)" + upstream_eq_ctl_32gt_0: PcieBitField = PcieBitField( + bit_mask=0x000000FF, desc="Upstream Equalization Control 32.0 GT/s 0" + ) + downstream_eq_ctl_32gt_0: PcieBitField = PcieBitField( + bit_mask=0x0000FF00, desc="Downstream Equalization Control 32.0 GT/s 0" + ) + + +class ECap32Gts(PcieCapStructure): + """Extended Capability for 32.0 GT/s Physical Layer""" + + cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.PL_32GT + offset: int = 0x00 + desc: str = "7.7.6 Physical Layer 32.0 GT/s Extended Capability" + header: Phy32GtEcapHdr = Phy32GtEcapHdr() + cap_reg: Phy32GtEcapCapReg = Phy32GtEcapCapReg() + status: Phy32GtStatReg = Phy32GtStatReg() + recv_data_1: TransReceived32GTData1 = TransReceived32GTData1() + recv_data_2: TransReceived32GTData2 = TransReceived32GTData2() + 
trans_data_1: TransReceived32GTData1 = TransReceived32GTData1(offset=0x18) + trans_data_2: TransReceived32GTData2 = TransReceived32GTData2(offset=0x1C) + eq_ctl_32gt_0: EqCtl32Gt0 = EqCtl32Gt0(offset=0x20, desc="32GT/s Equalization Control 0") + eq_ctl_32gt_1: EqCtl32Gt0 = EqCtl32Gt0(offset=0x21, desc="32GT/s Equalization Control 1") + eq_ctl_32gt_2: EqCtl32Gt0 = EqCtl32Gt0(offset=0x22, desc="32GT/s Equalization Control 2") + eq_ctl_32gt_3: EqCtl32Gt0 = EqCtl32Gt0(offset=0x23, desc="32GT/s Equalization Control 3") + eq_ctl_32gt_4: EqCtl32Gt0 = EqCtl32Gt0(offset=0x24, desc="32GT/s Equalization Control 4") + eq_ctl_32gt_5: EqCtl32Gt0 = EqCtl32Gt0(offset=0x25, desc="32GT/s Equalization Control 5") + eq_ctl_32gt_6: EqCtl32Gt0 = EqCtl32Gt0(offset=0x26, desc="32GT/s Equalization Control 6") + eq_ctl_32gt_7: EqCtl32Gt0 = EqCtl32Gt0(offset=0x27, desc="32GT/s Equalization Control 7") + eq_ctl_32gt_8: EqCtl32Gt0 = EqCtl32Gt0(offset=0x28, desc="32GT/s Equalization Control 8") + eq_ctl_32gt_9: EqCtl32Gt0 = EqCtl32Gt0(offset=0x29, desc="32GT/s Equalization Control 9") + eq_ctl_32gt_10: EqCtl32Gt0 = EqCtl32Gt0(offset=0x2A, desc="32GT/s Equalization Control 10") + eq_ctl_32gt_11: EqCtl32Gt0 = EqCtl32Gt0(offset=0x2B, desc="32GT/s Equalization Control 11") + eq_ctl_32gt_12: EqCtl32Gt0 = EqCtl32Gt0(offset=0x2C, desc="32GT/s Equalization Control 12") + eq_ctl_32gt_13: EqCtl32Gt0 = EqCtl32Gt0(offset=0x2D, desc="32GT/s Equalization Control 13") + eq_ctl_32gt_14: EqCtl32Gt0 = EqCtl32Gt0(offset=0x2E, desc="32GT/s Equalization Control 14") + eq_ctl_32gt_15: EqCtl32Gt0 = EqCtl32Gt0(offset=0x2F, desc="32GT/s Equalization Control 15") + eq_ctl_32gt_32: EqCtl32Gt0 = EqCtl32Gt0(offset=0x30, desc="32GT/s Equalization Control 32") + eq_ctl_32gt_17: EqCtl32Gt0 = EqCtl32Gt0(offset=0x31, desc="32GT/s Equalization Control 17") + eq_ctl_32gt_18: EqCtl32Gt0 = EqCtl32Gt0(offset=0x32, desc="32GT/s Equalization Control 18") + eq_ctl_32gt_19: EqCtl32Gt0 = EqCtl32Gt0(offset=0x33, desc="32GT/s Equalization 
class ECapAltProtocol(PcieCapStructure):
    """Extended Capability for Alternate Protocol"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.ALT_PROTOCOL
    offset: int = 0x00
    desc: str = "7.9.21 Alternate Protocol Extended Capability"


class ECapSfi(PcieCapStructure):
    """Extended Capability for System Firmware Intermediary"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.SFI
    offset: int = 0x00
    desc: str = "7.9.23 System Firmware Intermediary (SFI) Extended Capability"


class ECapDoe(PcieCapStructure):
    """Extended Capability for DOE"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.DOE
    offset: int = 0x00
    desc: str = "Cap DOE"


class ECapIntegrityDoe(PcieCapStructure):
    """Extended Capability for Integrity DOE"""

    cap_id: ClassVar[Enum] = ExtendedCapabilityEnum.INT_DOE
    offset: int = 0x00
    desc: str = "Int Cap DOE"


class PcieCfgSpace(BaseModel):
    """Holds the base registers and capability structures of a PCIe device

    - type_0_configuration: Type 0 Configuration Space, this is both the shared registers and the type0 specific registers
    - type_1_configuration: Type 1 Configuration Space, this is both the shared registers and the type1 specific registers
    - capability_pointers: A dictionary of capability pointers to the offset of the capability structure
    - extended_capability_pointers: A dictionary of extended capability pointers to the offset of the extended capability structure
    - cap_structure: A dictionary of capability structures
    - ecap_structure: A dictionary of extended capability structures

    """

    type_0_configuration: Type0Configuration = Type0Configuration()
    type_1_configuration: Type1Configuration = Type1Configuration()
    capability_pointers: Dict[CapabilityEnum, int] = {}
    extended_capability_pointers: Dict[ExtendedCapabilityEnum, int] = {}
    # SerializeAsAny is used to allow for the structure to be any of the capability structures so all registers and fields are dumped
    cap_structure: Dict[CapabilityEnum, SerializeAsAny[PcieCapStructure]] = {}
    ecap_structure: Dict[ExtendedCapabilityEnum, SerializeAsAny[PcieCapStructure]] = {}

    def get_struct(self, struct: type[AnyCap]) -> Optional[AnyCap]:
        """Get a structure from the cap_structure or ecap_structure based on the type

        The two base configuration types are answered from the dedicated
        attributes; every other type is looked up by its ``cap_id``, first in
        ``cap_structure`` and then in ``ecap_structure``.

        Parameters
        ----------
        struct : type[AnyCap]
            The structure to get from the cap_structure or ecap_structure

        Returns
        -------
        Optional[AnyCap]
            The structure if it exists, otherwise None
        """
        if struct == Type0Configuration:
            return self.type_0_configuration  # type: ignore[return-value]
        if struct == Type1Configuration:
            return self.type_1_configuration  # type: ignore[return-value]

        if hasattr(struct, "cap_id"):
            # Compare against None explicitly so a stored structure is returned
            # regardless of how its truthiness is defined.
            cap = self.cap_structure.get(struct.cap_id, None)  # type: ignore[attr-defined]
            if cap is not None:
                return cap  # type: ignore[return-value]
            ecap = self.ecap_structure.get(struct.cap_id, None)  # type: ignore[attr-defined]
            if ecap is not None:
                return ecap  # type: ignore[return-value]
        return None

    @field_validator("extended_capability_pointers", mode="before")
    @classmethod
    def str_to_enum_extended(cls, dict_in: Dict[str, int]) -> Dict[Enum, int]:
        """Converts a dictionary with string keys to Enum keys

        Keys that are not strings (for example keys that are already
        ExtendedCapabilityEnum members) are passed through unchanged instead of
        being dropped, matching the behaviour of ``str_to_enum``.

        Parameters
        ----------
        dict_in : Dict[str, int]
            The dictionary to convert

        Returns
        -------
        dict[Enum, int]
            The dictionary with Enum keys
        """
        dict_out: Dict[Enum, int] = {}
        for k, v in dict_in.items():
            if isinstance(k, str):
                dict_out[ExtendedCapabilityEnum(int(k))] = v
            else:
                dict_out[k] = v
        return dict_out

    @field_validator("capability_pointers", mode="before")
    @classmethod
    def str_to_enum(cls, dict_in: Dict[str, int]) -> Dict[Enum, int]:
        """Converts a dictionary with string keys to Enum keys

        Parameters
        ----------
        dict_in : Dict[str, int]
            The dictionary to convert

        Returns
        -------
        dict[Enum, int]
            The dictionary with Enum keys
        """
        dict_out: Dict[Enum, int] = {}
        for k, v in dict_in.items():
            if isinstance(k, str):
                dict_out[CapabilityEnum(int(k))] = v
            else:
                dict_out[k] = v
        return dict_out

    @field_validator("cap_structure", mode="before")
    @classmethod
    def validate_cap_structure(
        cls, cap_in: Dict[Union[int, str, CapabilityEnum], SerializeAsAny[PcieCapStructure]]
    ) -> Dict[CapabilityEnum, PcieCapStructure]:
        """Adjusts a generic PcieCapStructure dict into the specific PcieCapStructure, thereby populating all registers and fields"""
        return cls.conform_json_dict_to_cap_struct(cap_in, CapabilityEnum)  # type: ignore[arg-type, return-value]

    @field_validator("ecap_structure", mode="before")
    @classmethod
    def validate_ecap_structure(
        cls,
        ecap_in: Dict[Union[int, str, ExtendedCapabilityEnum], SerializeAsAny[PcieCapStructure]],
    ) -> Dict[ExtendedCapabilityEnum, PcieCapStructure]:
        """Adjusts a generic PcieCapStructure dict into the specific PcieCapStructure, thereby populating all registers and fields"""
        return cls.conform_json_dict_to_cap_struct(ecap_in, ExtendedCapabilityEnum)  # type: ignore[arg-type, return-value]

    @classmethod
    def conform_json_dict_to_cap_struct(
        cls,
        cap_structure_in: Dict[Union[str, int, Enum], PcieCapStructure],
        enum_type: type[Enum],
    ) -> Dict[Enum, PcieCapStructure]:
        """This is needed for when the model is loaded from a json/dict. Since the type of PcieCapStructure
        does not fully describe which cap structure it is and which registers it has, pydantic just assumes
        it is the base class. To override this behaviour the cap_id is used to discover which structure it
        really should be. This is only done if the value of the validated attribute is a dict

        Parameters
        ----------
        cap_structure_in : Dict[Union[str, int, Enum], PcieCapStructure]
            A capability structure to fix from json input
        enum_type : type[Enum]
            Which enum to use for values

        Returns
        -------
        dict[Enum, PcieCapStructure]
            A dict where the values are now the fully defined structure instead of the base class
        """
        cap_out: Dict[Enum, PcieCapStructure] = {}
        for key, value in cap_structure_in.items():
            if not isinstance(value, dict):
                # Already a structure instance: keep it under its original key.
                cap_out[key] = value  # type: ignore[index]
                continue

            # Resolve the key to an enum member for every key kind the signature
            # allows, so the lookup below never sees an unbound or stale value.
            if isinstance(key, enum_type):
                cap_enum = key
            elif isinstance(key, str):
                cap_enum = enum_type(int(key))
            else:
                cap_enum = enum_type(key)

            # Use a distinct local name rather than rebinding ``cls``.
            cap_class = cap_id_to_class(cap_enum)
            cap_out[cap_enum] = cap_class(**value)
        return cap_out


class PcieDataModel(DataModel):
    """class for collection of PCIe data.

    Optionals are used to allow for the data to be missing,
    This makes the data class more flexible for the analyzer
    which consumes only the required data. If any more data is
    required for the analyzer then they should not be set to
    default.

    - pcie_cfg_space: A dictionary of PCIe cfg space for the GPUs obtained with setpci command
    - vf_pcie_cfg_space: Optional dictionary with the same layout as pcie_cfg_space
      (presumably for virtual functions - confirm against the collector)

    """

    pcie_cfg_space: Dict[BdfStr, PcieCfgSpace]
    vf_pcie_cfg_space: Optional[Dict[BdfStr, PcieCfgSpace]] = None
import inspect
import os
import re
import traceback
from enum import Enum
from typing import Any, List, Optional, Set, Type, TypeVar, Union, get_args, get_origin

T = TypeVar("T")

# Optional "0x" prefix followed by at least one hex digit. Compiled once at
# import time; callers use fullmatch, so no explicit anchors are needed.
_HEX_PATTERN = re.compile(r"(0x)?[0-9a-fA-F]+")


def apply_bit_mask(in_hex: str, bit_mask_hex: str) -> Optional[str]:
    """Apply a hexadecimal bit mask and shift the result down to bit 0.

    The shift amount is the position of the lowest set bit of the mask.

    Args:
        in_hex (str): Hexadecimal input
        bit_mask_hex (str): Hexadecimal bit mask

    Returns:
        Optional[str]: hexadecimal output (as produced by ``hex``) after applying
        the bit mask and offset, or None if either argument is not valid hex
    """
    in_dec = hex_to_int(in_hex)
    bit_mask_dec = hex_to_int(bit_mask_hex)
    # hex_to_int already validates its input, so None here means "not hex".
    if in_dec is None or bit_mask_dec is None:
        return None
    return hex((in_dec & bit_mask_dec) >> get_bit_offset_int(bit_mask_dec))


def apply_bit_mask_int(in_int: int, bit_mask_int: int) -> Optional[int]:
    """Apply an integer bit mask and shift the result down to bit 0.

    Args:
        in_int (int): integer input
        bit_mask_int (int): integer bit mask

    Returns:
        int: integer output after applying bit mask and offset
    """
    return (in_int & bit_mask_int) >> get_bit_offset_int(bit_mask_int)


def get_bit_offset_int(bit_mask: int) -> int:
    """Extracts the bit offset from bit mask.
    For ex, bit_mask = 0x0010 (hex) -> 0b00010000 (bin)
    Returns bit offset of 4 (bit position of the "1")

    Args:
        bit_mask (int): integer bit mask

    Returns:
        int: position of the lowest set bit; 0 when the mask is zero or negative
    """
    if bit_mask <= 0:
        return 0
    # ``mask & -mask`` isolates the lowest set bit; its bit_length is position + 1.
    return (bit_mask & -bit_mask).bit_length() - 1


def get_bit_offset(bit_mask: str) -> Optional[int]:
    """Extracts the bit offset from bit mask.
    For ex, bit_mask = "0010" (hex) -> 0b00010000 (bin)
    Returns bit offset of 4 (bit position of the "1")

    Args:
        bit_mask (str): hex bit mask

    Returns:
        Optional[int]: bit offset, or None if ``bit_mask`` is not valid hex
    """
    bit_mask_int = hex_to_int(bit_mask)
    if bit_mask_int is None:
        return None
    return get_bit_offset_int(bit_mask_int)


def get_all_subclasses(cls: Type[T]) -> Set[Type[T]]:
    """Get a set with all subclasses of this class (not including this class)
    Subclasses are presented in no particular order. Abstract subclasses are
    traversed but not included in the result.

    Args:
        cls (Type[T]): class whose subclass tree is walked

    Returns:
        Set[Type[T]]: all non-abstract direct and indirect subclasses
    """
    subclasses: Set[Type[T]] = set()
    for subclass in cls.__subclasses__():
        subclasses |= get_all_subclasses(subclass)
        if not inspect.isabstract(subclass):
            subclasses.add(subclass)
    return subclasses


def get_subclass(
    class_name: str, class_type: Type[T], sub_classes: Optional[List[Type[T]]] = None
) -> Optional[Type[T]]:
    """get a subclass with a given name

    Args:
        class_name (str): target sub class name
        class_type (Type[T]): class type
        sub_classes (Optional[List[Type[T]]]): list of sub classes to check; when
            empty or None, all subclasses of ``class_type`` are searched

    Returns:
        Optional[Type[T]]: sub class or None if no sub class with target name is found
    """
    if not sub_classes:
        sub_classes = list(get_all_subclasses(class_type))

    return next(
        (sub_class for sub_class in sub_classes if sub_class.__name__ == class_name),
        None,
    )


def hex_to_int(hex_in: str) -> Optional[int]:
    """Converts given hex string to int

    Args:
        hex_in: hexadecimal string, with or without a "0x" prefix

    Returns:
        Optional[int]: hexadecimal converted to int, or None if the input is not
        a valid hexadecimal string
    """
    if not is_hex(hex_in):
        return None
    return int(hex_in, 16)


def is_hex(hex_in: str) -> bool:
    """Returns True or False based on whether the input hexadecimal is indeed hexadecimal

    Args:
        hex_in: hexadecimal string

    Returns:
        bool: True/False whether the input hexadecimal is indeed hexadecimal;
        empty and non-string input is reported as False rather than raising
    """
    if not isinstance(hex_in, str) or not hex_in:
        return False
    return _HEX_PATTERN.fullmatch(hex_in) is not None
diff --git a/test/functional/fixtures/pcie_plugin_advanced_config.json b/test/functional/fixtures/pcie_plugin_advanced_config.json new file mode 100644 index 00000000..54812949 --- /dev/null +++ b/test/functional/fixtures/pcie_plugin_advanced_config.json @@ -0,0 +1,28 @@ +{ + "global_args": {}, + "plugins": { + "PciePlugin": { + "analysis_args": { + "exp_speed": 5, + "exp_width": 16, + "exp_sriov_count": 8, + "exp_gpu_count_override": 4, + "exp_max_payload_size": { + "29631": 256, + "29711": 512 + }, + "exp_max_rd_req_size": { + "29631": 512, + "29711": 1024 + }, + "exp_ten_bit_tag_req_en": { + "29631": 1, + "29711": 0 + } + } + } + }, + "result_collators": {}, + "name": "PciePlugin advanced config", + "desc": "Advanced config for testing PciePlugin with device-specific settings" +} diff --git a/test/functional/fixtures/pcie_plugin_config.json b/test/functional/fixtures/pcie_plugin_config.json new file mode 100644 index 00000000..cc78167e --- /dev/null +++ b/test/functional/fixtures/pcie_plugin_config.json @@ -0,0 +1,19 @@ +{ + "global_args": {}, + "plugins": { + "PciePlugin": { + "analysis_args": { + "exp_speed": 5, + "exp_width": 16, + "exp_sriov_count": 8, + "exp_gpu_count_override": 4, + "exp_max_payload_size": 256, + "exp_max_rd_req_size": 512, + "exp_ten_bit_tag_req_en": 1 + } + } + }, + "result_collators": {}, + "name": "PciePlugin config", + "desc": "Config for testing PciePlugin" +} diff --git a/test/functional/test_pcie_plugin.py b/test/functional/test_pcie_plugin.py new file mode 100644 index 00000000..9d6c70c9 --- /dev/null +++ b/test/functional/test_pcie_plugin.py @@ -0,0 +1,148 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
"""Functional tests for PciePlugin with --plugin-configs."""

from pathlib import Path

import pytest


def _checked_output(result):
    """Run the assertions shared by every test and return stdout + stderr.

    The CLI may exit with 0, 1 or 2 in these smoke tests, and it must have
    written something to one of its output streams.
    """
    assert result.returncode in [0, 1, 2]
    output = result.stdout + result.stderr
    assert len(output) > 0
    return output


@pytest.fixture
def fixtures_dir():
    """Directory holding the JSON fixtures that sit next to this test module."""
    here = Path(__file__).parent
    return here / "fixtures"


@pytest.fixture
def pcie_config_file(fixtures_dir):
    """Path of the basic PciePlugin config fixture."""
    return fixtures_dir / "pcie_plugin_config.json"


@pytest.fixture
def pcie_advanced_config_file(fixtures_dir):
    """Path of the advanced (device-specific) PciePlugin config fixture."""
    return fixtures_dir / "pcie_plugin_advanced_config.json"


def test_pcie_plugin_with_basic_config(run_cli_command, pcie_config_file, tmp_path):
    """Run the CLI with the basic config and expect PCIe to be mentioned in its output."""
    assert pcie_config_file.exists(), f"Config file not found: {pcie_config_file}"

    cli_args = [
        "--log-path",
        str(tmp_path / "logs_pcie_basic"),
        "--plugin-configs",
        str(pcie_config_file),
    ]
    output = _checked_output(run_cli_command(cli_args, check=False))
    # "pcieplugin" contains "pcie", so this single check covers both spellings.
    assert "pcie" in output.lower()


def test_pcie_plugin_with_advanced_config(run_cli_command, pcie_advanced_config_file, tmp_path):
    """Run the CLI with the advanced config carrying device-specific settings."""
    assert pcie_advanced_config_file.exists(), f"Config file not found: {pcie_advanced_config_file}"

    cli_args = [
        "--log-path",
        str(tmp_path / "logs_pcie_advanced"),
        "--plugin-configs",
        str(pcie_advanced_config_file),
    ]
    _checked_output(run_cli_command(cli_args, check=False))


def test_pcie_plugin_with_run_plugins_subcommand(run_cli_command, tmp_path):
    """Run PciePlugin through the run-plugins subcommand."""
    cli_args = [
        "--log-path",
        str(tmp_path / "logs_pcie_subcommand"),
        "run-plugins",
        "PciePlugin",
    ]
    _checked_output(run_cli_command(cli_args, check=False))


def test_pcie_plugin_with_passive_interaction(run_cli_command, pcie_config_file, tmp_path):
    """Run PciePlugin with the PASSIVE system interaction level."""
    cli_args = [
        "--log-path",
        str(tmp_path / "logs_pcie_passive"),
        "--sys-interaction-level",
        "PASSIVE",
        "--plugin-configs",
        str(pcie_config_file),
    ]
    _checked_output(run_cli_command(cli_args, check=False))


def test_pcie_plugin_skip_sudo(run_cli_command, pcie_config_file, tmp_path):
    """Run PciePlugin with the --skip-sudo flag."""
    cli_args = [
        "--log-path",
        str(tmp_path / "logs_pcie_no_sudo"),
        "--skip-sudo",
        "--plugin-configs",
        str(pcie_config_file),
    ]
    _checked_output(run_cli_command(cli_args, check=False))


def test_pcie_plugin_combined_configs(
    run_cli_command, pcie_config_file, pcie_advanced_config_file, tmp_path
):
    """Run PciePlugin with both config files passed to --plugin-configs."""
    cli_args = [
        "--log-path",
        str(tmp_path / "logs_pcie_combined"),
        "--plugin-configs",
        str(pcie_config_file),
        str(pcie_advanced_config_file),
    ]
    _checked_output(run_cli_command(cli_args, check=False))
analyzer.analyze_data(correct_data, args) assert result.status == ExecutionStatus.ERROR - assert "2 sysctl parameter(s) mismatched. (1 errors)" in result.message + assert "2 sysctl parameter(s) mismatched." in result.message + assert "1 errors" in result.message + assert "Sysctl mismatch detected" in result.message