-
Notifications
You must be signed in to change notification settings - Fork 284
autoresume state handling improvements #2196
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
matthewlouisbrockman
merged 30 commits into
main
from
connect-autoresume-state-handling
Mar 25, 2026
Merged
Changes from all commits
Commits
Show all changes
30 commits
Select commit
Hold shift + click to select a range
5e8ee0c
autoresume handles state changes like /resume
matthewlouisbrockman ab85a92
api: re-check orchestrator state after pausing wait
matthewlouisbrockman 94ecdab
api: log missing routing info on auto-resume
matthewlouisbrockman f7f306a
api: cover proxy auto-resume state handling
matthewlouisbrockman e138219
api: clean up proxy auto-resume fallback
matthewlouisbrockman aa2c473
api: tighten proxy auto-resume state refresh
matthewlouisbrockman d89585c
proxy: surface snapshot in progress during auto-resume
matthewlouisbrockman 5ddc2bf
api: wait through snapshotting during auto-resume
matthewlouisbrockman f3f04d5
proxy: drop snapshot-specific auto-resume handling
matthewlouisbrockman a3de6df
can keep it all in the api
matthewlouisbrockman bcd451e
api: bound proxy auto-resume retries and reload snapshots on fallback
matthewlouisbrockman d5d236e
don't need shouldReloadSnapshot anymore
matthewlouisbrockman e1b408f
inline handleInitialSandboxAutoResumeLookupError
matthewlouisbrockman 964c632
go with failed precondition when the sandbox is in a transitioning state
matthewlouisbrockman 99c7358
ensure timeout waiting on transition (e.g. case paused -> running can…
matthewlouisbrockman 6f1775e
add comment with intended behavior on different states
matthewlouisbrockman 00719b4
can go with 1 minute grpc transition timeout - looks like 75s timeout…
matthewlouisbrockman 93259a3
move the getAutoResume snapshot outside the if/else
matthewlouisbrockman 1eb05e3
pass a still transition message to forward proof transitioning handl…
matthewlouisbrockman f2a491d
missed import
matthewlouisbrockman 2a3bddf
move HandleExistingSandboxAutoResume into api/internal/orchestrator
matthewlouisbrockman 2eb2308
normalize codes to keep grpc logic out of the orchestrator
matthewlouisbrockman d82d857
move SandboxStillTransitioningMessage from metadata consts to status …
matthewlouisbrockman d718341
clean up the HandleExistingSandboxAutoResume in resume.go and move th…
matthewlouisbrockman 7dc12ca
Merge branch 'main' into connect-autoresume-state-handling
matthewlouisbrockman fcdee7c
Merge branch 'main' into connect-autoresume-state-handling again
matthewlouisbrockman 9612f09
use getOrConnectNode
matthewlouisbrockman c89c8a3
tests updated for new sandbox.callbacks
matthewlouisbrockman 306a68f
serialize fixed-port fake gRPC servers in orchestrator tests to preve…
matthewlouisbrockman 724c237
don't need the missing node test at all
matthewlouisbrockman File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,123 @@ | ||
| package orchestrator | ||
|
|
||
| import ( | ||
| "context" | ||
| "errors" | ||
| "fmt" | ||
| "time" | ||
|
|
||
| "github.com/google/uuid" | ||
| "go.uber.org/zap" | ||
|
|
||
| apisandbox "github.com/e2b-dev/infra/packages/api/internal/sandbox" | ||
| sharedproxygrpc "github.com/e2b-dev/infra/packages/shared/pkg/grpc/proxy" | ||
| "github.com/e2b-dev/infra/packages/shared/pkg/logger" | ||
| ) | ||
|
|
||
| const MaxAutoResumeTransitionRetries = 3 | ||
|
|
||
| var ErrSandboxStillTransitioning = errors.New(sharedproxygrpc.SandboxStillTransitioningMessage) | ||
|
|
||
| func (o *Orchestrator) HandleExistingSandboxAutoResume( | ||
| ctx context.Context, | ||
| teamID uuid.UUID, | ||
| sandboxID string, | ||
| sbx apisandbox.Sandbox, | ||
| transitionWaitBudget time.Duration, | ||
| ) (string, bool, error) { | ||
| transitionCtx, cancel := context.WithTimeout(ctx, transitionWaitBudget) | ||
| defer cancel() | ||
|
|
||
| attempts := 0 | ||
|
|
||
| // Existing sandbox auto-resume state handling: | ||
| // - running: return the current node IP immediately | ||
| // - pausing/snapshotting: wait for the transition, refresh state, and retry | ||
| // - killing: treat as not found | ||
| // - anything else: return internal error | ||
| // - internal transition wait timeout: treat as "still transitioning" | ||
| // - caller cancellation/deadline: propagate the context error | ||
| for { | ||
| switch sbx.State { | ||
| case apisandbox.StatePausing, apisandbox.StateSnapshotting: | ||
| if attempts >= MaxAutoResumeTransitionRetries { | ||
| logger.L().Warn( | ||
| ctx, | ||
| "Sandbox is still transitioning after auto-resume retries", | ||
| logger.WithSandboxID(sandboxID), | ||
| zap.String("state", string(sbx.State)), | ||
| zap.Int("attempts", attempts), | ||
| ) | ||
|
|
||
| return "", false, ErrSandboxStillTransitioning | ||
| } | ||
|
|
||
| attempts++ | ||
| waitErrMsg := "error waiting for sandbox to pause" | ||
| if sbx.State == apisandbox.StatePausing { | ||
| logger.L().Debug(ctx, "Waiting for sandbox to pause before auto-resume", logger.WithSandboxID(sandboxID), zap.Int("attempt", attempts)) | ||
| } else { | ||
| waitErrMsg = "error waiting for sandbox snapshot to finish" | ||
| logger.L().Debug(ctx, "Waiting for sandbox snapshot to finish before auto-resume", logger.WithSandboxID(sandboxID), zap.Int("attempt", attempts)) | ||
| } | ||
|
|
||
| err := o.WaitForStateChange(transitionCtx, teamID, sandboxID) | ||
| if err != nil { | ||
| if errors.Is(transitionCtx.Err(), context.DeadlineExceeded) && ctx.Err() == nil { | ||
| logger.L().Warn( | ||
| ctx, | ||
| "Sandbox transition wait timed out during auto-resume", | ||
| logger.WithSandboxID(sandboxID), | ||
| zap.String("state", string(sbx.State)), | ||
| zap.Int("attempt", attempts), | ||
| zap.Duration("budget", transitionWaitBudget), | ||
| ) | ||
|
|
||
| return "", false, ErrSandboxStillTransitioning | ||
| } | ||
|
|
||
| if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { | ||
| return "", false, err | ||
| } | ||
|
|
||
| return "", false, errors.New(waitErrMsg) | ||
| } | ||
|
|
||
| updatedSandbox, getSandboxErr := o.GetSandbox(ctx, teamID, sandboxID) | ||
| if getSandboxErr == nil { | ||
| sbx = updatedSandbox | ||
|
|
||
| continue | ||
| } | ||
| if errors.Is(getSandboxErr, apisandbox.ErrNotFound) { | ||
| return "", false, nil | ||
| } | ||
|
|
||
| return "", false, fmt.Errorf("failed to refresh sandbox state: %w", getSandboxErr) | ||
| case apisandbox.StateKilling: | ||
| logger.L().Debug(ctx, "Sandbox is being killed, cannot auto-resume", logger.WithSandboxID(sandboxID)) | ||
|
|
||
| return "", false, apisandbox.ErrNotFound | ||
| case apisandbox.StateRunning: | ||
| node := o.getOrConnectNode(ctx, sbx.ClusterID, sbx.NodeID) | ||
| if node == nil { | ||
| logger.L().Error( | ||
| ctx, | ||
| "Sandbox is running but routing info is not available during auto-resume", | ||
| logger.WithSandboxID(sandboxID), | ||
| logger.WithTeamID(teamID.String()), | ||
| logger.WithNodeID(sbx.NodeID), | ||
| zap.Stringer("cluster_id", sbx.ClusterID), | ||
| ) | ||
|
|
||
| return "", false, errors.New("sandbox is running but routing info is not available yet") | ||
|
cursor[bot] marked this conversation as resolved.
|
||
| } | ||
|
|
||
| return node.IPAddress, true, nil | ||
| default: | ||
| logger.L().Error(ctx, "Sandbox is in an unknown state during auto-resume", logger.WithSandboxID(sandboxID), zap.String("state", string(sbx.State))) | ||
|
|
||
| return "", false, errors.New("sandbox is in an unknown state") | ||
| } | ||
| } | ||
| } | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.