74 changes: 40 additions & 34 deletions e2e/kube.go
@@ -20,6 +20,7 @@ import (
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
 	"k8s.io/client-go/tools/clientcmd"
@@ -148,49 +149,54 @@ func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string,
 
 func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssName string) string {
 	defer toolkit.LogStepf(t, "waiting for node %s to be ready", vmssName)()
-	var lastNode *corev1.Node
-
-	err := wait.PollUntilContextTimeout(ctx, 10*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) {
-		nodes, err := k.Typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
-		if err != nil {
-			t.Logf("error listing nodes: %v", err)
-			return false, nil
-		}
-
-		for i := range nodes.Items {
-			node := &nodes.Items[i]
-			if !strings.HasPrefix(node.Name, vmssName) {
-				continue
-			}
-			lastNode = node
-			nodeTaints, _ := json.Marshal(node.Spec.Taints)
-			nodeConditions, _ := json.Marshal(node.Status.Conditions)
-
-			for _, cond := range node.Status.Conditions {
-				if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
-					t.Logf("node %s is ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions))
-					return true, nil
-				}
-			}
-
-			t.Logf("node %s is not ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions))
-		}
-
-		return false, nil
-	})
-
-	if err != nil {
-		if lastNode == nil {
-			t.Fatalf("%q haven't appeared in k8s API server: %v", vmssName, err)
-			return ""
-		}
-		nodeString, _ := json.Marshal(lastNode)
-		t.Fatalf("failed to wait for %q (%s) to be ready %+v. Detail: %s", vmssName, lastNode.Name, lastNode.Status, string(nodeString))
-		return ""
-	}
-
-	return lastNode.Name
+	var lastNode *corev1.Node
+	for ctx.Err() == nil {
+		name := func() string {
+			watcher, err := k.Typed.CoreV1().Nodes().Watch(ctx, metav1.ListOptions{})
+			if err != nil {
+				t.Logf("failed to start node watch: %v, retrying in 5s", err)
+				select {
+				case <-ctx.Done():
+				case <-time.After(5 * time.Second):
+				}
+				return ""
+			}
+			defer watcher.Stop()
+
+			for event := range watcher.ResultChan() {
+				if event.Type == watch.Error {
+					t.Logf("node watch error: %v", event.Object)
+					return ""
+				}
+				node, ok := event.Object.(*corev1.Node)
+				if !ok || !strings.HasPrefix(node.Name, vmssName) {
+					continue
+				}
+				if event.Type == watch.Deleted {
+					t.Fatalf("node %s was deleted", node.Name)
+				}
+				lastNode = node
+				for _, cond := range node.Status.Conditions {
+					if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
+						t.Logf("node %s is ready", node.Name)
+						return node.Name
+					}
+				}
+			}
+			return ""
+		}()
+		if name != "" {
+			return name
+		}
+	}
+	if lastNode != nil {
+		nodeJSON, _ := json.Marshal(lastNode)
+		t.Fatalf("node %s (%s) not ready: %v\n%s", vmssName, lastNode.Name, ctx.Err(), nodeJSON)
+	}
+	t.Fatalf("node %q not found: %v", vmssName, ctx.Err())
+	return ""
 }
 
 // GetPodNetworkDebugPodForNode returns a pod that's a member of the 'debugnonhost' daemonset running in the cluster - this will return
Comment on lines 150 to +157

Copilot AI commented on Feb 7, 2026

The PR description says this was switched to ticker-based polling using List and that the function returns (string, error) instead of calling t.Fatalf, but the implementation still uses Watch and still calls t.Fatalf on failure/deleted events. Either update the implementation to match (List+polling, return error) or adjust the PR description to reflect the actual behavior.
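For reference, a minimal sketch of the variant the PR description implies — List-based polling that returns an error instead of failing the test — might look like the following. The 10s/10min cadence is an assumption carried over from the old implementation, waitUntilNodeReadyPolling is a hypothetical name, and fmt is assumed to be importable in this package.

```go
// Sketch only, not the PR's actual code: poll with List and surface an
// error to the caller rather than calling t.Fatalf directly.
func (k *Kubeclient) waitUntilNodeReadyPolling(ctx context.Context, vmssName string) (string, error) {
	var name string
	err := wait.PollUntilContextTimeout(ctx, 10*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) {
		nodes, err := k.Typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return false, nil // treat List errors as transient and keep polling
		}
		for i := range nodes.Items {
			node := &nodes.Items[i]
			if !strings.HasPrefix(node.Name, vmssName) {
				continue
			}
			for _, cond := range node.Status.Conditions {
				if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
					name = node.Name
					return true, nil
				}
			}
		}
		return false, nil
	})
	if err != nil {
		return "", fmt.Errorf("waiting for node %q to become ready: %w", vmssName, err)
	}
	return name, nil
}
```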
Comment on lines +156 to +157

Copilot AI commented on Apr 23, 2026

WaitUntilNodeReady now relies solely on a watch stream. If the node is already Ready before the watch starts (or if it becomes Ready without subsequent status updates), this can block until the context deadline even though the node is ready. Consider doing an initial Nodes().List to detect an already-ready node, then starting the watch from the returned resourceVersion to avoid missing events between list and watch.

Suggested change:

-			watcher, err := k.Typed.CoreV1().Nodes().Watch(ctx, metav1.ListOptions{})
-			if err != nil {
+			nodeList, err := k.Typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+			if err != nil {
+				t.Logf("failed to list nodes: %v, retrying in 5s", err)
+				select {
+				case <-ctx.Done():
+				case <-time.After(5 * time.Second):
+				}
+				return ""
+			}
+			for i := range nodeList.Items {
+				node := &nodeList.Items[i]
+				if !strings.HasPrefix(node.Name, vmssName) {
+					continue
+				}
+				lastNode = node
+				for _, cond := range node.Status.Conditions {
+					if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
+						t.Logf("node %s is ready", node.Name)
+						return node.Name
+					}
+				}
+			}
+			watcher, err := k.Typed.CoreV1().Nodes().Watch(ctx, metav1.ListOptions{ResourceVersion: nodeList.ResourceVersion})
+			if err != nil {
t.Logf("failed to start node watch: %v, retrying in 5s", err)
select {
case <-ctx.Done():
case <-time.After(5 * time.Second):
}
return ""
}
defer watcher.Stop()

lastNode = node
nodeTaints, _ := json.Marshal(node.Spec.Taints)
nodeConditions, _ := json.Marshal(node.Status.Conditions)

for _, cond := range node.Status.Conditions {
if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
t.Logf("node %s is ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions))
return true, nil
for event := range watcher.ResultChan() {
Comment on lines +156 to +167

Copilot AI commented on Feb 7, 2026

WaitUntilNodeReady starts a watch and then only reacts to future events. If the node is already present (or already Ready) when the watch begins, there may be no subsequent events and this can block until the context times out. Consider doing an initial Nodes().List/Nodes().Get check (or switching fully to PollUntilContextTimeout + List) to evaluate current state before watching/polling again.
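Both watch-gap comments describe the classic list-then-watch race. Besides the inline suggestion above, one established option is client-go's watchtools.UntilWithSync helper, which performs the initial List itself and then watches from the returned resourceVersion. A sketch under the assumption that the extra imports (k8s.io/apimachinery/pkg/fields, k8s.io/client-go/tools/cache, and watchtools "k8s.io/client-go/tools/watch") are acceptable here, with a hypothetical helper name:

```go
// Sketch: UntilWithSync lists first, then watches from the list's
// resourceVersion, so a node that is already Ready is seen immediately
// and no event is missed between the list and the watch.
func (k *Kubeclient) waitNodeReadyUntilWithSync(ctx context.Context, vmssName string) (string, error) {
	lw := cache.NewListWatchFromClient(k.Typed.CoreV1().RESTClient(), "nodes", metav1.NamespaceAll, fields.Everything())
	var name string
	_, err := watchtools.UntilWithSync(ctx, lw, &corev1.Node{}, nil, func(event watch.Event) (bool, error) {
		node, ok := event.Object.(*corev1.Node)
		if !ok || !strings.HasPrefix(node.Name, vmssName) {
			return false, nil
		}
		for _, cond := range node.Status.Conditions {
			if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
				name = node.Name
				return true, nil
			}
		}
		return false, nil
	})
	return name, err
}
```

The helper drives an informer internally, so re-listing and re-watching after watch expiry is also handled without a manual retry loop.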
Comment on lines +167 to +188

Copilot AI commented on Apr 23, 2026

When watcher.ResultChan() closes (e.g., apiserver closes the connection) or when a watch.Error event occurs, the code immediately re-establishes a new watch with no delay. This can spin in a tight loop and hammer the API server during outages/flaky connections. Add a small backoff (similar to the 5s delay used on watch start failures) before retrying after channel close/error events.
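One way to add that backoff — a sketch only, with the existing anonymous function factored out as a hypothetical watchOnce helper — is to pause in the outer loop whenever an attempt ends without a result:

```go
// Sketch: back off before re-establishing the watch after the result
// channel closes or a watch.Error event is seen, mirroring the 5s delay
// the function already uses when starting the watch fails. watchOnce
// stands in for the existing func() string closure.
for ctx.Err() == nil {
	if name := watchOnce(); name != "" {
		return name
	}
	select {
	case <-ctx.Done():
	case <-time.After(5 * time.Second):
	}
}
```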
if name != "" {
return name
}
nodeString, _ := json.Marshal(lastNode)
t.Fatalf("failed to wait for %q (%s) to be ready %+v. Detail: %s", vmssName, lastNode.Name, lastNode.Status, string(nodeString))
return ""
}

return lastNode.Name
if lastNode != nil {
nodeJSON, _ := json.Marshal(lastNode)
t.Fatalf("node %s (%s) not ready: %v\n%s", vmssName, lastNode.Name, ctx.Err(), nodeJSON)
}
t.Fatalf("node %q not found: %v", vmssName, ctx.Err())
return ""
}

// GetPodNetworkDebugPodForNode returns a pod that's a member of the 'debugnonhost' daemonset running in the cluster - this will return
Expand Down
5 changes: 1 addition & 4 deletions e2e/test_helpers.go
@@ -331,12 +331,9 @@ func prepareAKSNode(ctx context.Context, s *Scenario) (*ScenarioVM, error) {
 	require.NoError(s.T, err)
 
 	if !s.Config.SkipDefaultValidation {
-		vmssCreatedAt := time.Now()              // Record the start time
-		creationElapse := time.Since(start)      // Calculate the elapsed time
 		scenarioVM.KubeName = s.Runtime.Cluster.Kube.WaitUntilNodeReady(ctx, s.T, s.Runtime.VMSSName)
-		readyElapse := time.Since(vmssCreatedAt) // Calculate the elapsed time
 		totalElapse := time.Since(start)
-		toolkit.LogDuration(ctx, totalElapse, 3*time.Minute, fmt.Sprintf("Node %s took %s to be created and %s to be ready", s.Runtime.VMSSName, creationElapse, readyElapse))
+		toolkit.LogDuration(ctx, totalElapse, 3*time.Minute, fmt.Sprintf("Node %s took %s to be created and joined the cluster", s.Runtime.VMSSName, totalElapse))
 	}
 
 	return scenarioVM, nil