diff --git a/cmd/atenet/internal/app/router/errors.go b/cmd/atenet/internal/app/router/errors.go new file mode 100644 index 0000000..47bc9a1 --- /dev/null +++ b/cmd/atenet/internal/app/router/errors.go @@ -0,0 +1,86 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package router + +import ( + "fmt" + + envoy_type "github.com/envoyproxy/go-control-plane/envoy/type/v3" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +// newReqError builds a reqError whose body is the formatted message and no +// wrapped cause. Set the cause field directly when one is available. +func newReqError(code envoy_type.StatusCode, format string, args ...any) *reqError { + return &reqError{ + msg: fmt.Sprintf(format, args...), + statusCode: int(code), + } +} + +// actorNotFoundErr returns a 404 reqError identifying the missing actor. +func actorNotFoundErr(actorID string) *reqError { + return newReqError(envoy_type.StatusCode_NotFound, "actor %q not found", actorID) +} + +// invalidHostErr returns a 404 reqError explaining why the request host was +// rejected. The cause is preserved for log inspection via Unwrap. +func invalidHostErr(host string, cause error) *reqError { + return &reqError{ + msg: fmt.Sprintf("invalid host %q: %v", host, cause), + cause: cause, + statusCode: int(envoy_type.StatusCode_NotFound), + } +} + +// mapResumeError translates an ActorResumer error into a client-facing +// reqError. It maps gRPC status codes to appropriate HTTP status codes and +// short, human-readable bodies. The original error is preserved via Unwrap +// so callers can still inspect it via errors.Is / errors.As when logging. +// +// Unrecognised errors collapse to 500 with a generic body to avoid leaking +// server-side detail (stack traces, internal IDs) to clients. +func mapResumeError(actorID string, err error) *reqError { + if err == nil { + return nil + } + + var re *reqError + switch status.Code(err) { + case codes.NotFound: + re = actorNotFoundErr(actorID) + case codes.FailedPrecondition: + // Preserve the gRPC description for FailedPrecondition only: it carries + // actionable client-facing context (e.g. "no free workers available") + // and is not security-sensitive. + re = newReqError(envoy_type.StatusCode_ServiceUnavailable, + "actor %q unavailable: %s", actorID, status.Convert(err).Message()) + case codes.Unavailable: + re = newReqError(envoy_type.StatusCode_ServiceUnavailable, "actor %q unavailable", actorID) + case codes.DeadlineExceeded: + re = newReqError(envoy_type.StatusCode_GatewayTimeout, "actor %q request timed out", actorID) + case codes.PermissionDenied: + re = newReqError(envoy_type.StatusCode_Forbidden, "actor %q access denied", actorID) + case codes.Unauthenticated: + re = newReqError(envoy_type.StatusCode_Unauthorized, "actor %q authentication required", actorID) + case codes.ResourceExhausted: + re = newReqError(envoy_type.StatusCode_TooManyRequests, "actor %q rate limited", actorID) + default: + re = newReqError(envoy_type.StatusCode_InternalServerError, "error resuming actor %q", actorID) + } + re.cause = err + return re +} diff --git a/cmd/atenet/internal/app/router/errors_test.go b/cmd/atenet/internal/app/router/errors_test.go new file mode 100644 index 0000000..27ae07f --- /dev/null +++ b/cmd/atenet/internal/app/router/errors_test.go @@ -0,0 +1,178 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package router + +import ( + "errors" + "testing" + + envoy_type "github.com/envoyproxy/go-control-plane/envoy/type/v3" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +func TestNewReqError(t *testing.T) { + t.Parallel() + + err := newReqError(envoy_type.StatusCode_BadRequest, "actor %q is %s", "abc", "bad") + if err == nil { + t.Fatal("newReqError returned nil") + } + if err.statusCode != int(envoy_type.StatusCode_BadRequest) { + t.Errorf("statusCode = %d, want %d", err.statusCode, envoy_type.StatusCode_BadRequest) + } + if got, want := err.Error(), `actor "abc" is bad`; got != want { + t.Errorf("Error() = %q, want %q", got, want) + } +} + +func TestActorNotFoundErr(t *testing.T) { + t.Parallel() + + err := actorNotFoundErr("ctr6") + if err.statusCode != int(envoy_type.StatusCode_NotFound) { + t.Errorf("statusCode = %d, want %d", err.statusCode, envoy_type.StatusCode_NotFound) + } + if got, want := err.Error(), `actor "ctr6" not found`; got != want { + t.Errorf("Error() = %q, want %q", got, want) + } +} + +func TestInvalidHostErr(t *testing.T) { + t.Parallel() + + cause := errors.New("missing suffix") + err := invalidHostErr("foo.example.com", cause) + + if err.statusCode != int(envoy_type.StatusCode_NotFound) { + t.Errorf("statusCode = %d, want %d", err.statusCode, envoy_type.StatusCode_NotFound) + } + if got, want := err.Error(), `invalid host "foo.example.com": missing suffix`; got != want { + t.Errorf("Error() = %q, want %q", got, want) + } + if !errors.Is(err, cause) { + t.Errorf("errors.Is(err, cause) = false, want true (cause should be wrapped for logging)") + } +} + +func TestMapResumeError(t *testing.T) { + t.Parallel() + + const actorID = "ctr6" + + tests := []struct { + name string + err error + wantCode envoy_type.StatusCode + wantBody string + }{ + { + name: "NotFound maps to 404", + err: status.Error(codes.NotFound, "actor not found"), + wantCode: envoy_type.StatusCode_NotFound, + wantBody: `actor "ctr6" not found`, + }, + { + name: "FailedPrecondition maps to 503 and preserves desc", + err: status.Error(codes.FailedPrecondition, "no free workers available"), + wantCode: envoy_type.StatusCode_ServiceUnavailable, + wantBody: `actor "ctr6" unavailable: no free workers available`, + }, + { + name: "Unavailable maps to 503", + err: status.Error(codes.Unavailable, "control-plane down"), + wantCode: envoy_type.StatusCode_ServiceUnavailable, + wantBody: `actor "ctr6" unavailable`, + }, + { + name: "DeadlineExceeded maps to 504", + err: status.Error(codes.DeadlineExceeded, "context deadline exceeded"), + wantCode: envoy_type.StatusCode_GatewayTimeout, + wantBody: `actor "ctr6" request timed out`, + }, + { + name: "PermissionDenied maps to 403", + err: status.Error(codes.PermissionDenied, "denied"), + wantCode: envoy_type.StatusCode_Forbidden, + wantBody: `actor "ctr6" access denied`, + }, + { + name: "Unauthenticated maps to 401", + err: status.Error(codes.Unauthenticated, "no creds"), + wantCode: envoy_type.StatusCode_Unauthorized, + wantBody: `actor "ctr6" authentication required`, + }, + { + name: "ResourceExhausted maps to 429", + err: status.Error(codes.ResourceExhausted, "quota"), + wantCode: envoy_type.StatusCode_TooManyRequests, + wantBody: `actor "ctr6" rate limited`, + }, + { + name: "unknown gRPC code maps to 500 without leaking desc", + err: status.Error(codes.Internal, "stack trace: foo bar"), + wantCode: envoy_type.StatusCode_InternalServerError, + wantBody: `error resuming actor "ctr6"`, + }, + { + name: "non-gRPC error maps to 500 without leaking message", + err: errors.New("raw error with secret"), + wantCode: envoy_type.StatusCode_InternalServerError, + wantBody: `error resuming actor "ctr6"`, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + got := mapResumeError(actorID, tc.err) + if got == nil { + t.Fatal("mapResumeError returned nil") + } + if got.statusCode != int(tc.wantCode) { + t.Errorf("statusCode = %d, want %d", got.statusCode, tc.wantCode) + } + if got.Error() != tc.wantBody { + t.Errorf("Error() = %q, want %q", got.Error(), tc.wantBody) + } + if !errors.Is(got, tc.err) { + t.Errorf("errors.Is(result, original) = false, want true (original must be preserved for logs)") + } + }) + } +} + +func TestMapResumeError_NilError(t *testing.T) { + t.Parallel() + + // Guard against accidental nil-error calls. Returning nil keeps the + // happy path explicit at callsites instead of constructing a bogus 500. + if got := mapResumeError("ctr6", nil); got != nil { + t.Errorf("mapResumeError(_, nil) = %v, want nil", got) + } +} + +// Ensures mapResumeError result satisfies the reqError contract so the +// existing handleRequestHeaders branch (errors.As(err, &reqErr)) keeps working. +func TestMapResumeError_IsReqError(t *testing.T) { + t.Parallel() + + err := mapResumeError("x", status.Error(codes.NotFound, "x")) + var reqErr *reqError + if !errors.As(err, &reqErr) { + t.Fatalf("errors.As(*reqError) = false, want true; err type = %T", err) + } +} diff --git a/cmd/atenet/internal/app/router/extproc.go b/cmd/atenet/internal/app/router/extproc.go index 2e00f3c..7d4a069 100644 --- a/cmd/atenet/internal/app/router/extproc.go +++ b/cmd/atenet/internal/app/router/extproc.go @@ -124,8 +124,7 @@ func (s *ExtProcServer) handleRequestHeaders( actorID, err := parseActorID(metadata.host) if err != nil { - // Host is invalid, respond with 404. - return nil, metadata, "", notFoundErr + return nil, metadata, "", invalidHostErr(metadata.host, err) } slog.InfoContext(ctx, "ResumeActor", slog.String("actorID", actorID)) @@ -137,12 +136,13 @@ func (s *ExtProcServer) handleRequestHeaders( slog.Any("err", err)) if err != nil { - return nil, metadata, "", fmt.Errorf("error resuming actor %s: %w", actorID, err) + return nil, metadata, "", mapResumeError(actorID, err) } workerIP := actor.GetAteomPodIp() if ip := net.ParseIP(workerIP); ip == nil { - return nil, metadata, "", fmt.Errorf("actor %q did not have a valid IP %q", actorID, workerIP) + return nil, metadata, "", newReqError(envoy_type.StatusCode_InternalServerError, + "actor %q routing failed", actorID) } // TODO(bowei) -- handle more than port 80 on the actor. diff --git a/cmd/atenet/internal/app/router/extproc_out.go b/cmd/atenet/internal/app/router/extproc_out.go index c7386c5..cf8b880 100644 --- a/cmd/atenet/internal/app/router/extproc_out.go +++ b/cmd/atenet/internal/app/router/extproc_out.go @@ -15,22 +15,23 @@ package router import ( - "errors" - corev3 "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extproc "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" extprocv3 "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" envoy_type "github.com/envoyproxy/go-control-plane/envoy/type/v3" ) +// reqError carries an HTTP-mappable status code and a client-safe message. +// The underlying cause (if any) is preserved via Unwrap so logs can inspect +// the full chain without leaking server-side detail into the response body. type reqError struct { - error + msg string + cause error statusCode int } -var ( - notFoundErr = &reqError{error: errors.New("not found"), statusCode: int(envoy_type.StatusCode_NotFound)} -) +func (e *reqError) Error() string { return e.msg } +func (e *reqError) Unwrap() error { return e.cause } func addAuthorityMutation(auth string, mut *extprocv3.HeaderMutation) { mut.SetHeaders = append(mut.SetHeaders, diff --git a/cmd/atenet/internal/app/router/extproc_test.go b/cmd/atenet/internal/app/router/extproc_test.go index 07c475c..c1500dc 100644 --- a/cmd/atenet/internal/app/router/extproc_test.go +++ b/cmd/atenet/internal/app/router/extproc_test.go @@ -24,7 +24,10 @@ import ( "github.com/agent-substrate/substrate/pkg/proto/ateapipb" corev3 "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extprocv3 "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + envoy_type "github.com/envoyproxy/go-control-plane/envoy/type/v3" "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" ) type mockClient struct { @@ -45,25 +48,59 @@ func TestExtProcHeadersEvaluation(t *testing.T) { resumeResp *ateapipb.ResumeActorResponse resumeErr error expectErr bool - expectedErr error expectedErrStr string + expectedStatus envoy_type.StatusCode expectedTarget string }{ { - name: "invalid host", - authority: "invalid-host.com", - expectErr: true, - expectedErr: notFoundErr, + name: "invalid host returns 404 identifying the host", + authority: "invalid-host.com", + expectErr: true, + expectedErrStr: `invalid host "invalid-host.com": invalid actor_id: must end with actors.resources.substrate.ate.dev, got "invalid-host.com"`, + expectedStatus: envoy_type.StatusCode_NotFound, + }, + { + name: "non-gRPC resume error collapses to 500 without leaking detail", + authority: testUUID + ".actors.resources.substrate.ate.dev", + resumeErr: errors.New("resume failed with sensitive detail"), + expectErr: true, + expectedErrStr: `error resuming actor "123e4567-e89b-12d3-a456-426614174000"`, + expectedStatus: envoy_type.StatusCode_InternalServerError, }, { - name: "Error resuming actor", + name: "FailedPrecondition maps to 503 with preserved desc", authority: testUUID + ".actors.resources.substrate.ate.dev", - resumeErr: errors.New("resume failed"), + resumeErr: status.Error(codes.FailedPrecondition, "no free workers available"), expectErr: true, - expectedErrStr: "error resuming actor 123e4567-e89b-12d3-a456-426614174000: resume failed", + expectedErrStr: `actor "123e4567-e89b-12d3-a456-426614174000" unavailable: no free workers available`, + expectedStatus: envoy_type.StatusCode_ServiceUnavailable, }, { - name: "Bad Actor IP from resume", + name: "NotFound maps to 404", + authority: testUUID + ".actors.resources.substrate.ate.dev", + resumeErr: status.Error(codes.NotFound, "actor missing"), + expectErr: true, + expectedErrStr: `actor "123e4567-e89b-12d3-a456-426614174000" not found`, + expectedStatus: envoy_type.StatusCode_NotFound, + }, + { + name: "Unavailable maps to 503", + authority: testUUID + ".actors.resources.substrate.ate.dev", + resumeErr: status.Error(codes.Unavailable, "control-plane down"), + expectErr: true, + expectedErrStr: `actor "123e4567-e89b-12d3-a456-426614174000" unavailable`, + expectedStatus: envoy_type.StatusCode_ServiceUnavailable, + }, + { + name: "DeadlineExceeded maps to 504", + authority: testUUID + ".actors.resources.substrate.ate.dev", + resumeErr: status.Error(codes.DeadlineExceeded, "deadline"), + expectErr: true, + expectedErrStr: `actor "123e4567-e89b-12d3-a456-426614174000" request timed out`, + expectedStatus: envoy_type.StatusCode_GatewayTimeout, + }, + { + name: "Bad Actor IP from resume returns 500 without leaking IP", authority: testUUID + ".actors.resources.substrate.ate.dev", resumeResp: &ateapipb.ResumeActorResponse{ Actor: &ateapipb.Actor{ @@ -71,7 +108,8 @@ func TestExtProcHeadersEvaluation(t *testing.T) { }, }, expectErr: true, - expectedErrStr: "actor \"123e4567-e89b-12d3-a456-426614174000\" did not have a valid IP \"invalid-ip\"", + expectedErrStr: `actor "123e4567-e89b-12d3-a456-426614174000" routing failed`, + expectedStatus: envoy_type.StatusCode_InternalServerError, }, { name: "Successful resume", @@ -117,11 +155,18 @@ func TestExtProcHeadersEvaluation(t *testing.T) { if err == nil { t.Fatalf("expected error but got nil") } - if tc.expectedErr != nil && !errors.Is(err, tc.expectedErr) { - t.Errorf("expected error %v, got %v", tc.expectedErr, err) - } if tc.expectedErrStr != "" && err.Error() != tc.expectedErrStr { - t.Errorf("expected error string %q, got %q", tc.expectedErrStr, err.Error()) + t.Errorf("client body mismatch:\n got: %q\n want: %q", err.Error(), tc.expectedErrStr) + } + var reqErr *reqError + if !errors.As(err, &reqErr) { + t.Fatalf("expected *reqError, got %T (%v)", err, err) + } + if got, want := reqErr.statusCode, int(tc.expectedStatus); got != want { + t.Errorf("HTTP status code = %d, want %d", got, want) + } + if tc.resumeErr != nil && !errors.Is(err, tc.resumeErr) { + t.Errorf("original resume error must be preserved in chain for logs; errors.Is(err, resumeErr) = false") } return } diff --git a/cmd/atenet/internal/app/router/resumer.go b/cmd/atenet/internal/app/router/resumer.go index 74a75d0..eed4d03 100644 --- a/cmd/atenet/internal/app/router/resumer.go +++ b/cmd/atenet/internal/app/router/resumer.go @@ -68,9 +68,9 @@ func (r *ActorResumer) ResumeActor(ctx context.Context, actorID string) (*ateapi if status.Code(err) == codes.Aborted { return false, nil // Concurrent resume call, retry. } - if status.Code(err) == codes.NotFound { - return false, notFoundErr - } + // Other gRPC errors (NotFound, FailedPrecondition, Unavailable, + // DeadlineExceeded, ...) are returned to the caller unchanged so + // the HTTP boundary can map them with full fidelity. return false, err }) diff --git a/cmd/atenet/internal/app/router/resumer_test.go b/cmd/atenet/internal/app/router/resumer_test.go index 2b71f67..b34e4a0 100644 --- a/cmd/atenet/internal/app/router/resumer_test.go +++ b/cmd/atenet/internal/app/router/resumer_test.go @@ -16,7 +16,6 @@ package router import ( "context" - "errors" "sync" "testing" "time" @@ -111,8 +110,8 @@ func TestActorResumer_ResumeActor(t *testing.T) { resumer := NewActorResumer(mock) _, err := resumer.ResumeActor(context.Background(), testActorID) - if !errors.Is(err, notFoundErr) { - t.Errorf("expected notFoundErr, got %v", err) + if got := status.Code(err); got != codes.NotFound { + t.Errorf("expected gRPC code NotFound, got %v (err=%v)", got, err) } })