Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
403 changes: 374 additions & 29 deletions cmd/thv-operator/controllers/virtualmcpserver_controller.go

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions cmd/thv-operator/pkg/virtualmcpserverstatus/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,16 @@ func (s *StatusCollector) SetDiscoveredBackends(backends []mcpv1alpha1.Discovere
s.hasChanges = true
}

// GetDiscoveredBackends returns the current discovered backends value.
// If SetDiscoveredBackends has been called, returns the new value.
// Otherwise, returns the existing value from the VirtualMCPServer status.
func (s *StatusCollector) GetDiscoveredBackends() []mcpv1alpha1.DiscoveredBackend {
if s.discoveredBackends != nil {
return s.discoveredBackends
}
return s.vmcp.Status.DiscoveredBackends
}

// UpdateStatus applies all collected status changes in a single batch update.
// Expects vmcpStatus to be freshly fetched from the cluster to ensure the update operates on the latest resource version.
func (s *StatusCollector) UpdateStatus(ctx context.Context, vmcpStatus *mcpv1alpha1.VirtualMCPServerStatus) bool {
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions cmd/thv-operator/pkg/virtualmcpserverstatus/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ type StatusManager interface {
// SetDiscoveredBackends sets the discovered backends list
SetDiscoveredBackends(backends []mcpv1alpha1.DiscoveredBackend)

// GetDiscoveredBackends returns the current discovered backends value.
// If SetDiscoveredBackends has been called, returns the new value.
// Otherwise, returns the existing value from the VirtualMCPServer status.
GetDiscoveredBackends() []mcpv1alpha1.DiscoveredBackend

// UpdateStatus applies all collected status changes in a single batch update.
// Returns true if updates were applied, false if no changes were collected.
UpdateStatus(ctx context.Context, vmcpStatus *mcpv1alpha1.VirtualMCPServerStatus) bool
Expand Down
10 changes: 7 additions & 3 deletions cmd/thv-operator/pkg/vmcpconfig/converter.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,6 @@ func (c *Converter) Convert(
}

config.Telemetry = spectoconfig.ConvertTelemetryConfig(ctx, vmcp.Spec.Telemetry, vmcp.Name)
config.Audit = spectoconfig.ConvertAuditConfig(ctx, vmcp.Spec.Audit, vmcp.Name)

// Apply operational defaults (fills missing values)
config.EnsureOperationalDefaults()
Expand Down Expand Up @@ -863,7 +862,7 @@ func convertOutputProperty(

// convertOperational converts OperationalConfig from CRD to vmcp config
func (*Converter) convertOperational(
_ context.Context,
ctx context.Context,
vmcp *mcpv1alpha1.VirtualMCPServer,
) *vmcpconfig.OperationalConfig {
operational := &vmcpconfig.OperationalConfig{}
Expand Down Expand Up @@ -896,7 +895,12 @@ func (*Converter) convertOperational(

// Parse health check interval
if vmcp.Spec.Operational.FailureHandling.HealthCheckInterval != "" {
if duration, err := time.ParseDuration(vmcp.Spec.Operational.FailureHandling.HealthCheckInterval); err == nil {
duration, err := time.ParseDuration(vmcp.Spec.Operational.FailureHandling.HealthCheckInterval)
if err != nil {
ctxLogger := log.FromContext(ctx)
ctxLogger.Error(err, "Failed to parse HealthCheckInterval, health monitoring will be disabled",
"value", vmcp.Spec.Operational.FailureHandling.HealthCheckInterval)
} else {
operational.FailureHandling.HealthCheckInterval = vmcpconfig.Duration(duration)
}
}
Expand Down
89 changes: 88 additions & 1 deletion cmd/vmcp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ The Virtual MCP Server (vmcp) is a standalone binary that aggregates multiple MC
- ✅ **Health Endpoints**: `/health` and `/ping` for service monitoring
- ✅ **Configuration Validation**: `vmcp validate` command for config verification
- ✅ **Observability**: OpenTelemetry metrics and traces for backend operations and workflow executions
- ✅ **Health Monitoring & Circuit Breaker**: Per-backend health checks with circuit breaker pattern to prevent cascading failures

### In Progress
- 🚧 **Incoming Authentication** (Issue #165): OIDC, local, anonymous authentication
- 🚧 **Outgoing Authentication** (Issue #160): RFC 8693 token exchange for backend API access
- 🚧 **Token Caching**: Memory and Redis cache providers
- 🚧 **Health Monitoring** (Issue #166): Circuit breakers, backend health checks

### Future (Phase 2+)
- 📋 **Authorization**: Cedar policy-based access control
Expand Down Expand Up @@ -208,6 +208,93 @@ aggregation:
description: "Create a GitHub pull request"
```

## Health Monitoring & Circuit Breaker

Virtual MCP continuously monitors backend health and implements a circuit breaker pattern to prevent cascading failures when backends become unhealthy.

### Health Monitoring

Virtual MCP performs periodic health checks on all backend workloads using MCP's `ListCapabilities` operation. Health status is tracked per-backend and exposed through the VirtualMCPServer status in Kubernetes.

**Configuration:**

```yaml
operational:
failure_handling:
health_check_interval: "30s" # How often to check backend health
unhealthy_threshold: 3 # Failures before marking unhealthy
partial_failure_mode: best_effort # Continue with available backends
```

**Health States:**
- **Healthy**: Backend responding normally
- **Degraded**: Backend experiencing issues but still functional (circuit half-open)
- **Unhealthy**: Backend failing health checks (circuit open)

### Circuit Breaker

The circuit breaker prevents overwhelming failing backends by fast-failing requests when a backend is detected as unhealthy. This provides faster failure detection and reduces resource waste.

**How It Works:**

1. **Closed (Normal)**: All health check requests are sent to the backend
2. **Open (Fast-Fail)**: After `failure_threshold` consecutive failures, the circuit opens and health checks are skipped
3. **Half-Open (Testing)**: After `timeout` period, the circuit allows a test request to check if the backend has recovered
4. **Recovery**: On successful health check in half-open state, the circuit closes and normal operation resumes

**Configuration:**

```yaml
operational:
failure_handling:
circuit_breaker:
enabled: true # Enable circuit breaker (default: false)
failure_threshold: 5 # Open circuit after N failures (default: 5)
timeout: "60s" # Wait before testing recovery (default: 60s)
```

**Benefits:**
- **Faster failure detection**: No need to wait for timeouts on unhealthy backends
- **Reduced resource usage**: Skip health checks for known-unhealthy backends
- **Graceful degradation**: Continue serving requests with healthy backends using `partial_failure_mode: best_effort`
- **Automatic recovery**: Circuit automatically tests for backend recovery after timeout

**Per-Backend Isolation**: Each backend has its own independent circuit breaker. If one backend fails, others continue operating normally.

**Example Configuration:**

```yaml
name: "production-vmcp"
group: "prod-services"
incoming_auth:
type: anonymous
outgoing_auth:
source: inline
default:
type: unauthenticated
aggregation:
conflict_resolution: prefix
operational:
failure_handling:
health_check_interval: "30s"
unhealthy_threshold: 3
partial_failure_mode: best_effort
circuit_breaker:
enabled: true
failure_threshold: 5
timeout: "60s"
```

**Monitoring Circuit State:**

In Kubernetes, check the VirtualMCPServer status to see backend health and circuit states:

```bash
kubectl get virtualmcpserver my-vmcp -o yaml
```

Look for `status.discoveredBackends` to see each backend's health status.

## Architecture

```
Expand Down
56 changes: 48 additions & 8 deletions cmd/vmcp/app/commands.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
vmcpclient "github.com/stacklok/toolhive/pkg/vmcp/client"
"github.com/stacklok/toolhive/pkg/vmcp/config"
"github.com/stacklok/toolhive/pkg/vmcp/discovery"
"github.com/stacklok/toolhive/pkg/vmcp/health"
vmcprouter "github.com/stacklok/toolhive/pkg/vmcp/router"
vmcpserver "github.com/stacklok/toolhive/pkg/vmcp/server"
)
Expand Down Expand Up @@ -253,6 +254,8 @@ func discoverBackends(ctx context.Context, cfg *config.Config) ([]vmcp.Backend,
}

// runServe implements the serve command logic
//
//nolint:gocyclo // Complexity from server initialization and configuration is acceptable
func runServe(cmd *cobra.Command, _ []string) error {
ctx := cmd.Context()
configPath := viper.GetString("config")
Expand Down Expand Up @@ -330,15 +333,52 @@ func runServe(cmd *cobra.Command, _ []string) error {
}()
}

// Configure health monitoring if enabled
var healthMonitorConfig *health.MonitorConfig
if cfg.Operational != nil && cfg.Operational.FailureHandling != nil && cfg.Operational.FailureHandling.HealthCheckInterval > 0 {
// Validate configuration values before creating MonitorConfig
// This provides defense in depth and clearer error messages
checkInterval := time.Duration(cfg.Operational.FailureHandling.HealthCheckInterval)
if checkInterval <= 0 {
return fmt.Errorf("invalid health check configuration: check interval must be > 0, got %v", checkInterval)
}
if cfg.Operational.FailureHandling.UnhealthyThreshold < 1 {
return fmt.Errorf("invalid health check configuration: unhealthy threshold must be >= 1, got %d",
cfg.Operational.FailureHandling.UnhealthyThreshold)
}

// Map circuit breaker config if enabled
var circuitBreakerConfig *health.CircuitBreakerConfig
if cfg.Operational.FailureHandling.CircuitBreaker != nil &&
cfg.Operational.FailureHandling.CircuitBreaker.Enabled {
circuitBreakerConfig = &health.CircuitBreakerConfig{
Enabled: true,
FailureThreshold: cfg.Operational.FailureHandling.CircuitBreaker.FailureThreshold,
Timeout: time.Duration(cfg.Operational.FailureHandling.CircuitBreaker.Timeout),
}
}

defaults := health.DefaultConfig()
healthMonitorConfig = &health.MonitorConfig{
CheckInterval: checkInterval,
UnhealthyThreshold: cfg.Operational.FailureHandling.UnhealthyThreshold,
Timeout: defaults.Timeout,
DegradedThreshold: defaults.DegradedThreshold,
CircuitBreaker: circuitBreakerConfig,
}
logger.Info("Health monitoring configured from operational settings")
}

serverCfg := &vmcpserver.Config{
Name: cfg.Name,
Version: getVersion(),
Host: host,
Port: port,
AuthMiddleware: authMiddleware,
AuthInfoHandler: authInfoHandler,
TelemetryProvider: telemetryProvider,
AuditConfig: cfg.Audit,
Name: cfg.Name,
Version: getVersion(),
Host: host,
Port: port,
AuthMiddleware: authMiddleware,
AuthInfoHandler: authInfoHandler,
TelemetryProvider: telemetryProvider,
AuditConfig: cfg.Audit,
HealthMonitorConfig: healthMonitorConfig,
}

// Convert composite tool configurations to workflow definitions
Expand Down
Loading
Loading