|
256 | 256 | > "${ARTIFACT_DIR}/gpu_operand_ds_$(echo "$ds" | cut -d/ -f2).descr" |
257 | 257 | done |
258 | 258 |
|
# Section banner for the operator-namespace event dump.
printf '%s\n' "" "#" "# Kubernetes Events (operator namespace)" "#" ""

# Dump all events of the operator namespace, sorted by .lastTimestamp.
# stdout and stderr are both captured in the artifact; a failing query is
# deliberately ignored so that the rest of the collection keeps running.
echo "Get events in ${OPERATOR_NAMESPACE} (sorted by last timestamp)"
$K get events --sort-by='.lastTimestamp' -n "${OPERATOR_NAMESPACE}" \
    > "${ARTIFACT_DIR}/events_operator_namespace.log" 2>&1 \
    || true
| 270 | + |
echo ""
echo "#"
echo "# GPU Node Upgrade State"
echo "#"
echo ""

# For every node listed in $gpu_pci_nodes, append a report to
# ${ARTIFACT_DIR}/gpu_nodes.upgrade_state containing:
#   - annotations mentioning nvidia.com/gpu-driver
#   - the nvidia.com/gpu-driver-upgrade-state label
#   - the node conditions as TYPE=STATUS pairs
#   - .spec.unschedulable
#   - events whose involved object is this node
# Every query is best-effort: stderr is discarded and failures are ignored so
# that one unreachable node does not abort the whole collection.
echo "Get upgrade-related annotations and labels for GPU nodes"
upgrade_state_file="${ARTIFACT_DIR}/gpu_nodes.upgrade_state"

# $gpu_pci_nodes is left unquoted on purpose: it is a whitespace-separated
# list and must be word-split into one resource name per iteration.
for node in ${gpu_pci_nodes}; do
    # Drop the resource-type prefix ("<type>/<name>" -> "<name>").
    node_name="${node##*/}"

    # One grouped redirection instead of repeating the target on every line.
    {
        echo "=== ${node_name} ==="

        echo "# Upgrade annotations:"
        # Print one "key=value" line per annotation via go-template rather
        # than splitting the jsonpath map dump on commas: comma-splitting
        # truncates values that themselves contain commas and does not work
        # with kubectl versions that print maps as "map[k:v ...]".
        # grep -F matches the dotted annotation prefix literally.
        $K get "${node}" \
            -o go-template='{{range $k, $v := .metadata.annotations}}{{$k}}={{$v}}{{"\n"}}{{end}}' \
            2>/dev/null \
            | grep -F 'nvidia.com/gpu-driver' \
            || echo " (none)"
        echo ""

        echo "# Upgrade state label:"
        # The dots of the label key are escaped so jsonpath treats
        # "nvidia.com/gpu-driver-upgrade-state" as a single field name.
        $K get "${node}" -ojsonpath='{.metadata.labels.nvidia\.com/gpu-driver-upgrade-state}' 2>/dev/null \
            || true
        echo ""

        echo "# Node conditions (Ready, SchedulingDisabled, etc.):"
        $K get "${node}" -o jsonpath='{range .status.conditions[*]}{.type}={.status} {end}' 2>/dev/null \
            || true
        echo ""

        echo "# Unschedulable:"
        $K get "${node}" -ojsonpath='{.spec.unschedulable}' 2>/dev/null \
            || true
        echo ""

        echo "# Events on node (upgrade-related):"
        # Node is a cluster-scoped resource, so its events are not stored in
        # the operator namespace (they normally land in "default"). Search
        # every namespace instead of relying on whatever namespace the
        # current kubeconfig context happens to select.
        $K get events --all-namespaces \
            --field-selector "involvedObject.name=${node_name},involvedObject.kind=Node" \
            --sort-by='.lastTimestamp' \
            2>/dev/null \
            || true
        echo ""
    } >> "${upgrade_state_file}"
done
| 311 | + |
# Section banner for the ControllerRevision dump.
printf '%s\n' "" "#" "# Controller Revisions (driver DaemonSets)" "#" ""

# Table view of all ControllerRevisions of the operator namespace, sorted by
# their .revision field. Output and errors are both kept in the artifact.
echo "Get controller revisions in ${OPERATOR_NAMESPACE}"
$K get controllerrevisions --sort-by='.revision' -n "${OPERATOR_NAMESPACE}" \
    > "${ARTIFACT_DIR}/controller_revisions.log" 2>&1 \
    || true

# Full YAML of every ControllerRevision, split into two multi-document files
# according to the name of its first owner reference: owners whose name
# contains "driver" (any letter case) go to the driver file, everything else
# (including revisions whose owner could not be read) goes to the other file.
echo "Get controller revision details (driver vs other operands)"
for cr in $($K get controllerrevisions -n "${OPERATOR_NAMESPACE}" -oname 2>/dev/null); do
    cr_owner=$($K get "${cr}" -n "${OPERATOR_NAMESPACE}" -ojsonpath='{.metadata.ownerReferences[0].name}' 2>/dev/null || true)

    # Pick the destination file once, then dump with a single code path.
    case "${cr_owner}" in
        *[Dd][Rr][Ii][Vv][Ee][Rr]*)
            cr_yaml_file="${ARTIFACT_DIR}/controller_revisions_driver.yaml"
            ;;
        *)
            cr_yaml_file="${ARTIFACT_DIR}/controller_revisions_other.yaml"
            ;;
    esac

    $K get "${cr}" -n "${OPERATOR_NAMESPACE}" -oyaml >> "${cr_yaml_file}" 2>&1 || true
    # YAML document separator between consecutive revisions.
    echo "---" >> "${cr_yaml_file}"
done
| 337 | + |
259 | 338 | echo "" |
260 | 339 | echo "#" |
261 | 340 | echo "# nvidia-bug-report.sh" |
|
0 commit comments