Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
5e8ee0c
autoresume handles state changes like /resume
matthewlouisbrockman Mar 21, 2026
ab85a92
api: re-check orchestrator state after pausing wait
matthewlouisbrockman Mar 22, 2026
94ecdab
api: log missing routing info on auto-resume
matthewlouisbrockman Mar 22, 2026
f7f306a
api: cover proxy auto-resume state handling
matthewlouisbrockman Mar 22, 2026
e138219
api: clean up proxy auto-resume fallback
matthewlouisbrockman Mar 22, 2026
aa2c473
api: tighten proxy auto-resume state refresh
matthewlouisbrockman Mar 22, 2026
d89585c
proxy: surface snapshot in progress during auto-resume
matthewlouisbrockman Mar 22, 2026
5ddc2bf
api: wait through snapshotting during auto-resume
matthewlouisbrockman Mar 22, 2026
f3f04d5
proxy: drop snapshot-specific auto-resume handling
matthewlouisbrockman Mar 22, 2026
a3de6df
can keep it all in the api
matthewlouisbrockman Mar 22, 2026
bcd451e
api: bound proxy auto-resume retries and reload snapshots on fallback
matthewlouisbrockman Mar 22, 2026
d5d236e
don't need shouldReloadSnapshot anymore
matthewlouisbrockman Mar 22, 2026
e1b408f
inline handleInitialSandboxAutoResumeLookupError
matthewlouisbrockman Mar 22, 2026
964c632
go with failed precondition when the sandbox is in a transitioning state
matthewlouisbrockman Mar 22, 2026
99c7358
ensure timeout waiting on transition (e.g. case paused -> running can…
matthewlouisbrockman Mar 22, 2026
6f1775e
add comment with intended behavior on different states
matthewlouisbrockman Mar 22, 2026
00719b4
can go with 1 minute grpc transition timeout - looks like 75s timeout…
matthewlouisbrockman Mar 22, 2026
93259a3
move the getAutoResume snapshot outside the if/else
matthewlouisbrockman Mar 23, 2026
1eb05e3
pass a still transition message to forward proof transitioning handl…
matthewlouisbrockman Mar 23, 2026
f2a491d
missed import
matthewlouisbrockman Mar 23, 2026
2a3bddf
move HandleExistingSandboxAutoResume into api/internal/orchestrator
matthewlouisbrockman Mar 23, 2026
2eb2308
normalize codes to keep grpc logic out of the orchestrator
matthewlouisbrockman Mar 23, 2026
d82d857
move SandboxStillTransitioningMessage from metadata consts to status …
matthewlouisbrockman Mar 24, 2026
d718341
clean up the HandleExistingSandboxAutoResume in resume.go and move th…
matthewlouisbrockman Mar 24, 2026
7dc12ca
Merge branch 'main' into connect-autoresume-state-handling
matthewlouisbrockman Mar 24, 2026
fcdee7c
Merge branch 'main' into connect-autoresume-state-handling again
matthewlouisbrockman Mar 24, 2026
9612f09
use getOrConnectNode
matthewlouisbrockman Mar 24, 2026
c89c8a3
tests updated for new sandbox.callbacks
matthewlouisbrockman Mar 24, 2026
306a68f
serialize fixed-port fake gRPC servers in orchestrator tests to preve…
matthewlouisbrockman Mar 24, 2026
724c237
don't need the missing node test at all
matthewlouisbrockman Mar 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 65 additions & 10 deletions packages/api/internal/handlers/proxy_grpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ import (

snapshotcache "github.com/e2b-dev/infra/packages/api/internal/cache/snapshots"
dbapi "github.com/e2b-dev/infra/packages/api/internal/db"
apiorchestrator "github.com/e2b-dev/infra/packages/api/internal/orchestrator"
"github.com/e2b-dev/infra/packages/api/internal/sandbox"
"github.com/e2b-dev/infra/packages/api/internal/utils"
dbtypes "github.com/e2b-dev/infra/packages/db/pkg/types"
"github.com/e2b-dev/infra/packages/shared/pkg/consts"
Expand Down Expand Up @@ -85,36 +87,89 @@ func denyResumePermission() error {
return status.Error(codes.PermissionDenied, "permission denied")
}

func (s *SandboxService) ResumeSandbox(ctx context.Context, req *proxygrpc.SandboxResumeRequest) (*proxygrpc.SandboxResumeResponse, error) {
sandboxID, err := utils.ShortID(req.GetSandboxId())
if err != nil {
return nil, status.Error(codes.InvalidArgument, "invalid sandbox ID")
}
// autoResumeTransitionWaitBudget caps how long ResumeSandbox waits for an
// existing sandbox to finish a pause/snapshot transition during auto-resume
// before reporting it as still transitioning.
const autoResumeTransitionWaitBudget = time.Minute

// getAutoResumeSnapshot loads the cached snapshot for sandboxID and returns it
// together with its auto-resume configuration. It returns a gRPC NotFound
// error when the snapshot is missing or when auto-resume is not enabled with
// the "any" policy, and Internal for any other lookup failure.
func (s *SandboxService) getAutoResumeSnapshot(ctx context.Context, sandboxID string) (*snapshotcache.SnapshotInfo, *dbtypes.SandboxAutoResumeConfig, error) {
	snap, err := s.api.snapshotCache.Get(ctx, sandboxID)
	switch {
	case errors.Is(err, snapshotcache.ErrSnapshotNotFound):
		return nil, nil, status.Error(codes.NotFound, "snapshot not found")
	case err != nil:
		return nil, nil, status.Errorf(codes.Internal, "failed to get snapshot: %v", err)
	}

	var autoResume *dbtypes.SandboxAutoResumeConfig
	if cfg := snap.Snapshot.Config; cfg != nil {
		autoResume = cfg.AutoResume
	}

	// Auto-resume must be explicitly enabled with the "any" policy; anything
	// else is reported as NotFound so callers cannot distinguish a disabled
	// sandbox from a nonexistent one.
	if autoResume == nil || autoResume.Policy != dbtypes.SandboxAutoResumeAny {
		return nil, nil, status.Error(codes.NotFound, "sandbox auto-resume disabled")
	}

	return snap, autoResume, nil
}

func (s *SandboxService) ResumeSandbox(ctx context.Context, req *proxygrpc.SandboxResumeRequest) (*proxygrpc.SandboxResumeResponse, error) {
sandboxID, err := utils.ShortID(req.GetSandboxId())
if err != nil {
return nil, status.Error(codes.InvalidArgument, "invalid sandbox ID")
}

var autoResume *dbtypes.SandboxAutoResumeConfig
snap, _, err := s.getAutoResumeSnapshot(ctx, sandboxID)
if err != nil {
return nil, err
}

teamID := snap.Snapshot.TeamID

sandboxData, sandboxErr := s.api.orchestrator.GetSandbox(ctx, teamID, sandboxID)
if sandboxErr != nil {
if !errors.Is(sandboxErr, sandbox.ErrNotFound) {
return nil, status.Errorf(codes.Internal, "failed to get sandbox state: %v", sandboxErr)
}
} else {
nodeIP, handled, existingErr := s.api.orchestrator.HandleExistingSandboxAutoResume(
ctx,
teamID,
sandboxID,
sandboxData,
autoResumeTransitionWaitBudget,
)
if existingErr != nil {
if errors.Is(existingErr, apiorchestrator.ErrSandboxStillTransitioning) {
return nil, status.Error(codes.FailedPrecondition, proxygrpc.SandboxStillTransitioningMessage)
}
if errors.Is(existingErr, sandbox.ErrNotFound) {
return nil, status.Error(codes.NotFound, "sandbox not found")
}
if errors.Is(existingErr, context.Canceled) || errors.Is(existingErr, context.DeadlineExceeded) {
return nil, status.FromContextError(existingErr).Err()
}

return nil, status.Error(codes.Internal, existingErr.Error())
}
if handled {
return &proxygrpc.SandboxResumeResponse{OrchestratorIp: nodeIP}, nil
Comment thread
matthewlouisbrockman marked this conversation as resolved.
}
}
Comment thread
matthewlouisbrockman marked this conversation as resolved.

// Reload snapshot metadata after orchestrator checks so we do not resume from stale
// pre-pause snapshot data.
snap, autoResume, err = s.getAutoResumeSnapshot(ctx, sandboxID)
if err != nil {
return nil, err
}

teamID = snap.Snapshot.TeamID

team, err := dbapi.GetTeamByID(ctx, s.api.authDB, teamID)
if err != nil {
return nil, status.Errorf(codes.Internal, "failed to get team: %v", err)
}
minAutoResumeTimeout := time.Duration(s.api.featureFlags.IntFlag(ctx, featureflags.MinAutoResumeTimeoutSeconds)) * time.Second

timeout := calculateAutoResumeTimeout(autoResume, minAutoResumeTimeout, team)

autoPause := snap.Snapshot.AutoPause
Expand Down
123 changes: 123 additions & 0 deletions packages/api/internal/orchestrator/autoresume.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package orchestrator

import (
"context"
"errors"
"fmt"
"time"

"github.com/google/uuid"
"go.uber.org/zap"

apisandbox "github.com/e2b-dev/infra/packages/api/internal/sandbox"
sharedproxygrpc "github.com/e2b-dev/infra/packages/shared/pkg/grpc/proxy"
"github.com/e2b-dev/infra/packages/shared/pkg/logger"
)

// MaxAutoResumeTransitionRetries bounds how many times
// HandleExistingSandboxAutoResume re-checks a sandbox that is stuck in a
// pausing/snapshotting transition before giving up with
// ErrSandboxStillTransitioning.
const MaxAutoResumeTransitionRetries = 3

// ErrSandboxStillTransitioning reports that a sandbox did not leave its
// transitional (pausing/snapshotting) state within the retry/time budget.
// Callers translate it into a FailedPrecondition gRPC status.
var ErrSandboxStillTransitioning = errors.New(sharedproxygrpc.SandboxStillTransitioningMessage)

func (o *Orchestrator) HandleExistingSandboxAutoResume(
ctx context.Context,
teamID uuid.UUID,
sandboxID string,
sbx apisandbox.Sandbox,
transitionWaitBudget time.Duration,
) (string, bool, error) {
transitionCtx, cancel := context.WithTimeout(ctx, transitionWaitBudget)
defer cancel()

attempts := 0

// Existing sandbox auto-resume state handling:
// - running: return the current node IP immediately
// - pausing/snapshotting: wait for the transition, refresh state, and retry
// - killing: treat as not found
// - anything else: return internal error
// - internal transition wait timeout: treat as "still transitioning"
// - caller cancellation/deadline: propagate the context error
for {
switch sbx.State {
case apisandbox.StatePausing, apisandbox.StateSnapshotting:
if attempts >= MaxAutoResumeTransitionRetries {
logger.L().Warn(
ctx,
"Sandbox is still transitioning after auto-resume retries",
logger.WithSandboxID(sandboxID),
zap.String("state", string(sbx.State)),
zap.Int("attempts", attempts),
)

return "", false, ErrSandboxStillTransitioning
}

attempts++
waitErrMsg := "error waiting for sandbox to pause"
if sbx.State == apisandbox.StatePausing {
logger.L().Debug(ctx, "Waiting for sandbox to pause before auto-resume", logger.WithSandboxID(sandboxID), zap.Int("attempt", attempts))
} else {
waitErrMsg = "error waiting for sandbox snapshot to finish"
logger.L().Debug(ctx, "Waiting for sandbox snapshot to finish before auto-resume", logger.WithSandboxID(sandboxID), zap.Int("attempt", attempts))
}

err := o.WaitForStateChange(transitionCtx, teamID, sandboxID)
if err != nil {
if errors.Is(transitionCtx.Err(), context.DeadlineExceeded) && ctx.Err() == nil {
logger.L().Warn(
ctx,
"Sandbox transition wait timed out during auto-resume",
logger.WithSandboxID(sandboxID),
zap.String("state", string(sbx.State)),
zap.Int("attempt", attempts),
zap.Duration("budget", transitionWaitBudget),
)

return "", false, ErrSandboxStillTransitioning
}

if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return "", false, err
}

return "", false, errors.New(waitErrMsg)
}

updatedSandbox, getSandboxErr := o.GetSandbox(ctx, teamID, sandboxID)
if getSandboxErr == nil {
sbx = updatedSandbox

continue
}
if errors.Is(getSandboxErr, apisandbox.ErrNotFound) {
return "", false, nil
}

return "", false, fmt.Errorf("failed to refresh sandbox state: %w", getSandboxErr)
case apisandbox.StateKilling:
logger.L().Debug(ctx, "Sandbox is being killed, cannot auto-resume", logger.WithSandboxID(sandboxID))

return "", false, apisandbox.ErrNotFound
case apisandbox.StateRunning:
node := o.getOrConnectNode(ctx, sbx.ClusterID, sbx.NodeID)
if node == nil {
logger.L().Error(
ctx,
"Sandbox is running but routing info is not available during auto-resume",
logger.WithSandboxID(sandboxID),
logger.WithTeamID(teamID.String()),
logger.WithNodeID(sbx.NodeID),
zap.Stringer("cluster_id", sbx.ClusterID),
)

return "", false, errors.New("sandbox is running but routing info is not available yet")
Comment thread
cursor[bot] marked this conversation as resolved.
}

return node.IPAddress, true, nil
default:
logger.L().Error(ctx, "Sandbox is in an unknown state during auto-resume", logger.WithSandboxID(sandboxID), zap.String("state", string(sbx.State)))

return "", false, errors.New("sandbox is in an unknown state")
}
}
}
Loading
Loading