Skip to content

Commit 163bf6b

Browse files
Collect events and upgrade state in must-gather.sh
1 parent dcde038 commit 163bf6b

File tree

1 file changed

+79
-0
lines changed

1 file changed

+79
-0
lines changed

hack/must-gather.sh

Lines changed: 79 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -256,6 +256,85 @@ do
256256
> "${ARTIFACT_DIR}/gpu_operand_ds_$(echo "$ds" | cut -d/ -f2).descr"
257257
done
258258

# Section banner for the operator-namespace event dump.
printf '%s\n' "" "#" "# Kubernetes Events (operator namespace)" "#" ""

# Save every event from the operator namespace, ordered by last timestamp.
# stdout and stderr both land in the log file; the trailing `|| true` keeps
# collection best-effort so a failed query does not stop the gather.
printf '%s\n' "Get events in ${OPERATOR_NAMESPACE} (sorted by last timestamp)"
$K get events --namespace "${OPERATOR_NAMESPACE}" --sort-by='.lastTimestamp' \
    > "${ARTIFACT_DIR}/events_operator_namespace.log" 2>&1 \
    || true

echo ""
echo "#"
echo "# GPU Node Upgrade State"
echo "#"
echo ""

# For every GPU node, append a report to gpu_nodes.upgrade_state containing:
#   - annotations whose text contains "nvidia.com/gpu-driver"
#   - the nvidia.com/gpu-driver-upgrade-state label
#   - node conditions as type=status pairs
#   - the .spec.unschedulable flag
#   - events whose involved object is this Node
# Every query is best-effort: failures are swallowed so one unreachable node
# does not abort the rest of the collection.
echo "Get upgrade-related annotations and labels for GPU nodes"
upgrade_state_file="${ARTIFACT_DIR}/gpu_nodes.upgrade_state"

# ${gpu_pci_nodes} is intentionally unquoted: it holds a whitespace-separated
# list of "<kind>/<name>" references and must be word-split.
for node in ${gpu_pci_nodes}; do
    # Strip the "<kind>/" prefix to get the bare node name.
    node_name="${node#*/}"

    # A single grouped redirection replaces the per-line appends.
    {
        echo "=== ${node_name} ==="

        echo "# Upgrade annotations:"
        # The annotation map is split on commas so each entry lands on its own
        # line for grep. -F matches the key prefix literally (the dots are not
        # regex wildcards). " (none)" is printed when nothing matches.
        $K get "${node}" -ojsonpath='{.metadata.annotations}' 2>/dev/null \
            | tr ',' '\n' \
            | grep -F 'nvidia.com/gpu-driver' 2>/dev/null \
            || echo " (none)"
        echo ""

        echo "# Upgrade state label:"
        $K get "${node}" -ojsonpath='{.metadata.labels.nvidia\.com/gpu-driver-upgrade-state}' 2>/dev/null \
            || true
        echo ""

        echo "# Node conditions (Ready, SchedulingDisabled, etc.):"
        $K get "${node}" -o jsonpath='{range .status.conditions[*]}{.type}={.status} {end}' 2>/dev/null \
            || true
        echo ""

        echo "# Unschedulable:"
        $K get "${node}" -ojsonpath='{.spec.unschedulable}' 2>/dev/null \
            || true
        echo ""

        echo "# Events on node (upgrade-related):"
        # Node objects are cluster-scoped, so events about them are not stored
        # in the operator namespace (they are typically recorded in "default").
        # Search all namespaces instead of relying on whichever namespace the
        # current context selects, otherwise this section is usually empty.
        $K get events --all-namespaces \
            --field-selector "involvedObject.name=${node_name},involvedObject.kind=Node" \
            --sort-by='.lastTimestamp' \
            2>/dev/null \
            || true
        echo ""
    } >> "${upgrade_state_file}"
done

# Section banner for the ControllerRevision dump.
printf '%s\n' "" "#" "# Controller Revisions (driver DaemonSets)" "#" ""

# Table listing of every ControllerRevision in the operator namespace, ordered
# by revision number. Errors go into the same log; collection is best-effort.
printf '%s\n' "Get controller revisions in ${OPERATOR_NAMESPACE}"
$K get controllerrevisions --namespace "${OPERATOR_NAMESPACE}" --sort-by='.revision' \
    > "${ARTIFACT_DIR}/controller_revisions.log" 2>&1 \
    || true

# Full YAML of each revision, routed by the name of its first owner reference:
# owners whose name contains "driver" (case-insensitive) go to the driver file,
# everything else (including revisions with no readable owner) to the other.
printf '%s\n' "Get controller revision details (driver vs other operands)"
for cr in $($K get controllerrevisions -n "${OPERATOR_NAMESPACE}" -oname 2>/dev/null); do
    cr_owner=$($K get "${cr}" -n "${OPERATOR_NAMESPACE}" -ojsonpath='{.metadata.ownerReferences[0].name}' 2>/dev/null || true)

    # Pick the destination first so the dump itself is written only once.
    if printf '%s\n' "${cr_owner}" | grep -qi 'driver'; then
        cr_dump="${ARTIFACT_DIR}/controller_revisions_driver.yaml"
    else
        cr_dump="${ARTIFACT_DIR}/controller_revisions_other.yaml"
    fi

    $K get "${cr}" -n "${OPERATOR_NAMESPACE}" -oyaml >> "${cr_dump}" 2>&1 || true
    # YAML document separator between consecutive revisions.
    echo "---" >> "${cr_dump}"
done

259338
echo ""
260339
echo "#"
261340
echo "# nvidia-bug-report.sh"

0 commit comments

Comments
 (0)