Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions nodescraper/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ def process_args(
plugin_arg_index = -1

plugin_arg_map = {}
invalid_plugins = []
if plugin_arg_index != -1 and plugin_arg_index != len(raw_arg_input) - 1:
top_level_args = raw_arg_input[: plugin_arg_index + 1]
plugin_args = raw_arg_input[plugin_arg_index + 1 :]
Expand All @@ -344,12 +345,26 @@ def process_args(
else:
cur_plugin = None
for arg in plugin_args:
if arg in plugin_names:
# Handle comma-separated plugin names (but not arguments)
if not arg.startswith("-") and "," in arg:
# Split comma-separated plugin names
for potential_plugin in arg.split(","):
potential_plugin = potential_plugin.strip()
if potential_plugin in plugin_names:
plugin_arg_map[potential_plugin] = []
cur_plugin = potential_plugin
elif potential_plugin:
# Track invalid plugin names to log event later
invalid_plugins.append(potential_plugin)
elif arg in plugin_names:
plugin_arg_map[arg] = []
cur_plugin = arg
elif cur_plugin:
plugin_arg_map[cur_plugin].append(arg)
return (top_level_args, plugin_arg_map)
elif not arg.startswith("-"):
# Track invalid plugin names to log event later
invalid_plugins.append(arg)
return (top_level_args, plugin_arg_map, invalid_plugins)


def main(arg_input: Optional[list[str]] = None):
Expand All @@ -367,7 +382,9 @@ def main(arg_input: Optional[list[str]] = None):
parser, plugin_subparser_map = build_parser(plugin_reg, config_reg)

try:
top_level_args, plugin_arg_map = process_args(arg_input, list(plugin_subparser_map.keys()))
top_level_args, plugin_arg_map, invalid_plugins = process_args(
arg_input, list(plugin_subparser_map.keys())
)

parsed_args = parser.parse_args(top_level_args)
system_info = get_system_info(parsed_args)
Expand All @@ -387,6 +404,13 @@ def main(arg_input: Optional[list[str]] = None):
if log_path:
logger.info("Log path: %s", log_path)

# Log warning if invalid plugin names were provided
if invalid_plugins:
logger.warning(
"Invalid plugin name(s) ignored: %s. Use 'describe plugin' to list available plugins.",
", ".join(invalid_plugins),
)

if parsed_args.subcmd == "summary":
generate_summary(parsed_args.search_path, parsed_args.output_path, logger)
sys.exit(0)
Expand Down
6 changes: 6 additions & 0 deletions nodescraper/pluginexecutor.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,12 @@ def run_queue(self) -> list[PluginResult]:
global_run_args = self.apply_global_args_to_plugin(
plugin_inst, plugin_class, self.plugin_config.global_args
)
# Merge analysis_args and collection_args
for args_key in ["analysis_args", "collection_args"]:
if args_key in global_run_args and args_key in run_payload:
# Merge: when a key is present in both, the global value overrides the plugin-specific one
run_payload[args_key].update(global_run_args[args_key])
del global_run_args[args_key]
run_payload.update(global_run_args)
except ValueError as ve:
self.logger.error(
Expand Down
133 changes: 123 additions & 10 deletions nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@
# SOFTWARE.
#
###############################################################################
import io
from collections import defaultdict
from typing import Any, Dict, List, Optional, Union
from typing import Any, Optional, Union

from nodescraper.enums import EventCategory, EventPriority
from nodescraper.interfaces import DataAnalyzer
Expand All @@ -34,16 +35,19 @@
AmdSmiDataModel,
AmdSmiMetric,
AmdSmiStatic,
AmdSmiTstData,
EccData,
Fw,
Partition,
Processes,
XgmiMetrics,
)
from .analyzer_args import AmdSmiAnalyzerArgs
from .cper import CperAnalysisTaskMixin


class AmdSmiAnalyzer(DataAnalyzer[AmdSmiDataModel, None]):
""""""
class AmdSmiAnalyzer(CperAnalysisTaskMixin, DataAnalyzer[AmdSmiDataModel, None]):
"""Check AMD SMI Application data for PCIe, ECC errors, CPER data, and analyze amdsmitst metrics"""

DATA_MODEL = AmdSmiDataModel

Expand Down Expand Up @@ -441,7 +445,7 @@ def check_static_data(

mismatches: list[tuple[int, str, str, str]] = []

expected_data: Dict[str, Optional[str]] = {
expected_data: dict[str, Optional[str]] = {
"vendor_id": vendor_id,
"subvendor_id": subvendor_id,
"vendor_name": "Advanced Micro Devices Inc",
Expand Down Expand Up @@ -500,24 +504,24 @@ def check_static_data(

def _format_static_mismatch_payload(
self,
mismatches: List[tuple[int, str, str, str]],
) -> Dict[str, Any]:
mismatches: list[tuple[int, str, str, str]],
) -> dict[str, Any]:
"""Helper function for pretty printing mismatch in expected data

Args:
mismatches (List[tuple[int, str, str, str]]): mismatched data per GPU
mismatches (list[tuple[int, str, str, str]]): mismatched data per GPU

Returns:
Dict[str, Any]: dict of mismatched data per GPU
dict[str, Any]: dict of mismatched data per GPU
"""
per_gpu: Dict[int, List[Dict[str, str]]] = defaultdict(list)
per_gpu: dict[int, list[dict[str, str]]] = defaultdict(list)
field_set: set[str] = set()

for gpu, field, expected, actual in mismatches:
field_set.add(field)
per_gpu[gpu].append({"field": field, "expected": expected, "actual": actual})

per_gpu_list: List[Dict[str, Any]] = [
per_gpu_list: list[dict[str, Any]] = [
{"gpu": gpu, "mismatches": entries}
for gpu, entries in sorted(per_gpu.items(), key=lambda kv: kv[0])
]
Expand Down Expand Up @@ -635,6 +639,97 @@ def check_expected_memory_partition_mode(
},
)

def check_expected_xgmi_link_speed(
    self,
    xgmi_metric: Optional[list[XgmiMetrics]],
    expected_xgmi_speed: Optional[list[float]] = None,
):
    """Check the XGMI link speed for all GPUs

    Logs a WARNING event and returns early when there is no XGMI data or no
    expected speed is configured. Otherwise logs one ERROR event for every
    entry whose bit rate is missing, cannot be converted to a float, or is
    not one of the expected speeds.

    Args:
        xgmi_metric (Optional[list[XgmiMetrics]]): XGMI metrics data
        expected_xgmi_speed (Optional[list[float]]): List of expected XGMI speeds (GT/s)
    """
    if not xgmi_metric:
        self._log_event(
            category=EventCategory.IO,
            description="XGMI link speed data is not available and cannot be checked",
            priority=EventPriority.WARNING,
            data={"xgmi_metric": xgmi_metric},
        )
        return

    if not expected_xgmi_speed:
        self._log_event(
            category=EventCategory.IO,
            description="Expected XGMI speed not configured, skipping XGMI link speed check",
            priority=EventPriority.WARNING,
        )
        return

    for xgmi_data in xgmi_metric:
        bit_rate = xgmi_data.link_metrics.bit_rate

        if bit_rate is None or bit_rate.value is None:
            # NOTE(review): this payload reports the unit under "xgmi_bit_rate"
            # (the value is None on this path) - confirm that is intended.
            self._log_event(
                category=EventCategory.IO,
                description="XGMI link speed is not available",
                priority=EventPriority.ERROR,
                data={
                    "gpu": xgmi_data.gpu,
                    "xgmi_bit_rate": bit_rate.unit if bit_rate else "N/A",
                },
            )
            continue

        # Keep the try body to the conversion only, so that errors raised by
        # logging or attribute access are not misreported as a bad number.
        try:
            xgmi_float = float(bit_rate.value)
        except (TypeError, ValueError):
            # float() raises ValueError for unparsable strings and TypeError
            # for values that are neither strings nor numbers.
            self._log_event(
                category=EventCategory.IO,
                description="XGMI link speed is not a valid number",
                priority=EventPriority.ERROR,
                data={
                    "gpu": xgmi_data.gpu,
                    "xgmi_bit_rate": bit_rate.value if bit_rate else "N/A",
                },
            )
            continue

        if xgmi_float not in expected_xgmi_speed:
            self._log_event(
                category=EventCategory.IO,
                description="XGMI link speed is not as expected",
                priority=EventPriority.ERROR,
                data={
                    "gpu": xgmi_data.gpu,
                    "xgmi_bit_rate": xgmi_float,
                    "expected_xgmi_speed": expected_xgmi_speed,
                },
                console_log=True,
            )

def check_amdsmitst(self, amdsmitst_data: AmdSmiTstData):
    """Report failed amdsmitst tests as an error event.

    Nothing is logged when the failed test count is not greater than zero.

    Args:
        amdsmitst_data (AmdSmiTstData): AMD SMI test data
    """
    if not amdsmitst_data.failed_test_count > 0:
        return

    # Payload carries both the count and the failed tests taken from the data model.
    failure_details = {
        "failed_test_count": amdsmitst_data.failed_test_count,
        "failed_tests": amdsmitst_data.failed_tests,
    }
    self._log_event(
        category=EventCategory.APPLICATION,
        description=f"{amdsmitst_data.failed_test_count} failed tests running amdsmitst",
        priority=EventPriority.ERROR,
        data=failure_details,
        console_log=True,
    )

def analyze_data(
self, data: AmdSmiDataModel, args: Optional[AmdSmiAnalyzerArgs] = None
) -> TaskResult:
Expand Down Expand Up @@ -705,4 +800,22 @@ def analyze_data(
if args.expected_pldm_version:
self.check_pldm_version(data.firmware, args.expected_pldm_version)

if data.cper_data:
self.analyzer_cpers(
{
file_model_obj.file_name: io.BytesIO(file_model_obj.file_contents)
for file_model_obj in data.cper_data
},
analysis_range_start=args.analysis_range_start,
analysis_range_end=args.analysis_range_end,
)

if data.xgmi_metric and len(data.xgmi_metric) > 0:
self.check_expected_xgmi_link_speed(
data.xgmi_metric, expected_xgmi_speed=args.expected_xgmi_speed
)

if data.amdsmitst_data and data.amdsmitst_data.failed_test_count > 0:
self.check_amdsmitst(data.amdsmitst_data)

return self.result
Loading