stacklok · yrobla · Dec 18, 2025 · Dec 22, 2025 · Dec 15, 2025 · Dec 15, 2025
diff --git a/cmd/thv-operator/controllers/virtualmcpserver_controller.go b/cmd/thv-operator/controllers/virtualmcpserver_controller.go
diff --git a/cmd/thv-operator/pkg/virtualmcpserverstatus/collector.go b/cmd/thv-operator/pkg/virtualmcpserverstatus/collector.go
@@ -94,6 +94,16 @@ func (s *StatusCollector) SetDiscoveredBackends(backends []mcpv1alpha1.Discovere
 	s.hasChanges = true
 }
 
+// GetDiscoveredBackends returns the current discovered backends value.
+// If SetDiscoveredBackends has been called, returns the new value.
+// Otherwise, returns the existing value from the VirtualMCPServer status.
+func (s *StatusCollector) GetDiscoveredBackends() []mcpv1alpha1.DiscoveredBackend {
+	if s.discoveredBackends != nil {
+		return s.discoveredBackends
+	}
+	return s.vmcp.Status.DiscoveredBackends
+}
+
 // UpdateStatus applies all collected status changes in a single batch update.
 // Expects vmcpStatus to be freshly fetched from the cluster to ensure the update operates on the latest resource version.
 func (s *StatusCollector) UpdateStatus(ctx context.Context, vmcpStatus *mcpv1alpha1.VirtualMCPServerStatus) bool {

diff --git a/cmd/thv-operator/pkg/virtualmcpserverstatus/mocks/mock_collector.go b/cmd/thv-operator/pkg/virtualmcpserverstatus/mocks/mock_collector.go
diff --git a/cmd/thv-operator/pkg/virtualmcpserverstatus/types.go b/cmd/thv-operator/pkg/virtualmcpserverstatus/types.go
@@ -44,6 +44,11 @@ type StatusManager interface {
 	// SetDiscoveredBackends sets the discovered backends list
 	SetDiscoveredBackends(backends []mcpv1alpha1.DiscoveredBackend)
 
+	// GetDiscoveredBackends returns the current discovered backends value.
+	// If SetDiscoveredBackends has been called, returns the new value.
+	// Otherwise, returns the existing value from the VirtualMCPServer status.
+	GetDiscoveredBackends() []mcpv1alpha1.DiscoveredBackend
+
 	// UpdateStatus applies all collected status changes in a single batch update.
 	// Returns true if updates were applied, false if no changes were collected.
 	UpdateStatus(ctx context.Context, vmcpStatus *mcpv1alpha1.VirtualMCPServerStatus) bool

diff --git a/cmd/thv-operator/pkg/vmcpconfig/converter.go b/cmd/thv-operator/pkg/vmcpconfig/converter.go
@@ -124,7 +124,6 @@ func (c *Converter) Convert(
 	}
 
 	config.Telemetry = spectoconfig.ConvertTelemetryConfig(ctx, vmcp.Spec.Telemetry, vmcp.Name)
-	config.Audit = spectoconfig.ConvertAuditConfig(ctx, vmcp.Spec.Audit, vmcp.Name)
 
 	// Apply operational defaults (fills missing values)
 	config.EnsureOperationalDefaults()
@@ -863,7 +862,7 @@ func convertOutputProperty(
 
 // convertOperational converts OperationalConfig from CRD to vmcp config
 func (*Converter) convertOperational(
-	_ context.Context,
+	ctx context.Context,
 	vmcp *mcpv1alpha1.VirtualMCPServer,
 ) *vmcpconfig.OperationalConfig {
 	operational := &vmcpconfig.OperationalConfig{}
@@ -896,7 +895,12 @@ func (*Converter) convertOperational(
 
 		// Parse health check interval
 		if vmcp.Spec.Operational.FailureHandling.HealthCheckInterval != "" {
-			if duration, err := time.ParseDuration(vmcp.Spec.Operational.FailureHandling.HealthCheckInterval); err == nil {
+			duration, err := time.ParseDuration(vmcp.Spec.Operational.FailureHandling.HealthCheckInterval)
+			if err != nil {
+				ctxLogger := log.FromContext(ctx)
+				ctxLogger.Error(err, "Failed to parse HealthCheckInterval, health monitoring will be disabled",
+					"value", vmcp.Spec.Operational.FailureHandling.HealthCheckInterval)
+			} else {
 				operational.FailureHandling.HealthCheckInterval = vmcpconfig.Duration(duration)
 			}
 		}

diff --git a/cmd/vmcp/README.md b/cmd/vmcp/README.md
@@ -15,12 +15,12 @@ The Virtual MCP Server (vmcp) is a standalone binary that aggregates multiple MC
 - ✅ **Health Endpoints**: `/health` and `/ping` for service monitoring
 - ✅ **Configuration Validation**: `vmcp validate` command for config verification
 - ✅ **Observability**: OpenTelemetry metrics and traces for backend operations and workflow executions
+- ✅ **Health Monitoring & Circuit Breaker**: Per-backend health checks with circuit breaker pattern to prevent cascading failures
 
 ### In Progress
 - 🚧 **Incoming Authentication** (Issue #165): OIDC, local, anonymous authentication
 - 🚧 **Outgoing Authentication** (Issue #160): RFC 8693 token exchange for backend API access
 - 🚧 **Token Caching**: Memory and Redis cache providers
-- 🚧 **Health Monitoring** (Issue #166): Circuit breakers, backend health checks
 
 ### Future (Phase 2+)
 - 📋 **Authorization**: Cedar policy-based access control
@@ -208,6 +208,93 @@ aggregation:
           description: "Create a GitHub pull request"
 ```
 
+## Health Monitoring & Circuit Breaker
+
+Virtual MCP continuously monitors backend health and implements a circuit breaker pattern to prevent cascading failures when backends become unhealthy.
+
+### Health Monitoring
+
+Virtual MCP performs periodic health checks on all backend workloads using MCP's `ListCapabilities` operation. Health status is tracked per-backend and exposed through the VirtualMCPServer status in Kubernetes.
+
+**Configuration:**
+
+```yaml
+operational:
+  failure_handling:
+    health_check_interval: "30s"     # How often to check backend health
+    unhealthy_threshold: 3           # Failures before marking unhealthy
+    partial_failure_mode: best_effort # Continue with available backends
+```
+
+**Health States:**
+- **Healthy**: Backend responding normally
+- **Degraded**: Backend experiencing issues but still functional (circuit half-open)
+- **Unhealthy**: Backend failing health checks (circuit open)
+
+### Circuit Breaker
+
+The circuit breaker prevents overwhelming failing backends by fast-failing requests when a backend is detected as unhealthy. This provides faster failure detection and reduces resource waste.
+
+**How It Works:**
+
+1. **Closed (Normal)**: All health check requests are sent to the backend
+2. **Open (Fast-Fail)**: After `failure_threshold` consecutive failures, the circuit opens and health checks are skipped
+3. **Half-Open (Testing)**: After `timeout` period, the circuit allows a test request to check if the backend has recovered
+4. **Recovery**: On successful health check in half-open state, the circuit closes and normal operation resumes
+
+**Configuration:**
+
+```yaml
+operational:
+  failure_handling:
+    circuit_breaker:
+      enabled: true              # Enable circuit breaker (default: false)
+      failure_threshold: 5       # Open circuit after N failures (default: 5)
+      timeout: "60s"             # Wait before testing recovery (default: 60s)
+```
+
+**Benefits:**
+- **Faster failure detection**: No need to wait for timeouts on unhealthy backends
+- **Reduced resource usage**: Skip health checks for known-unhealthy backends
+- **Graceful degradation**: Continue serving requests with healthy backends using `partial_failure_mode: best_effort`
+- **Automatic recovery**: Circuit automatically tests for backend recovery after timeout
+
+**Per-Backend Isolation**: Each backend has its own independent circuit breaker. If one backend fails, others continue operating normally.
+
+**Example Configuration:**
+
+```yaml
+name: "production-vmcp"
+group: "prod-services"
+incoming_auth:
+  type: anonymous
+outgoing_auth:
+  source: inline
+  default:
+    type: unauthenticated
+aggregation:
+  conflict_resolution: prefix
+operational:
+  failure_handling:
+    health_check_interval: "30s"
+    unhealthy_threshold: 3
+    partial_failure_mode: best_effort
+    circuit_breaker:
+      enabled: true
+      failure_threshold: 5
+      timeout: "60s"
+```
+
+**Monitoring Circuit State:**
+
+In Kubernetes, check the VirtualMCPServer status to see backend health and circuit states:
+
+```bash
+kubectl get virtualmcpserver my-vmcp -o yaml
+```
+
+Look for `status.discoveredBackends` to see each backend's health status.
+
 ## Architecture
 
 ```

diff --git a/cmd/vmcp/app/commands.go b/cmd/vmcp/app/commands.go
@@ -20,6 +20,7 @@ import (
 	vmcpclient "github.com/stacklok/toolhive/pkg/vmcp/client"
 	"github.com/stacklok/toolhive/pkg/vmcp/config"
 	"github.com/stacklok/toolhive/pkg/vmcp/discovery"
+	"github.com/stacklok/toolhive/pkg/vmcp/health"
 	vmcprouter "github.com/stacklok/toolhive/pkg/vmcp/router"
 	vmcpserver "github.com/stacklok/toolhive/pkg/vmcp/server"
 )
@@ -253,6 +254,8 @@ func discoverBackends(ctx context.Context, cfg *config.Config) ([]vmcp.Backend,
 }
 
 // runServe implements the serve command logic
+//
+//nolint:gocyclo // Complexity from server initialization and configuration is acceptable
 func runServe(cmd *cobra.Command, _ []string) error {
 	ctx := cmd.Context()
 	configPath := viper.GetString("config")
@@ -330,15 +333,52 @@ func runServe(cmd *cobra.Command, _ []string) error {
 		}()
 	}
 
+	// Configure health monitoring if enabled
+	var healthMonitorConfig *health.MonitorConfig
+	if cfg.Operational != nil && cfg.Operational.FailureHandling != nil && cfg.Operational.FailureHandling.HealthCheckInterval > 0 {
+		// Validate configuration values before creating MonitorConfig
+		// This provides defense in depth and clearer error messages
+		checkInterval := time.Duration(cfg.Operational.FailureHandling.HealthCheckInterval)
+		if checkInterval <= 0 {
+			return fmt.Errorf("invalid health check configuration: check interval must be > 0, got %v", checkInterval)
+		}
+		if cfg.Operational.FailureHandling.UnhealthyThreshold < 1 {
+			return fmt.Errorf("invalid health check configuration: unhealthy threshold must be >= 1, got %d",
+				cfg.Operational.FailureHandling.UnhealthyThreshold)
+		}
+
+		// Map circuit breaker config if enabled
+		var circuitBreakerConfig *health.CircuitBreakerConfig
+		if cfg.Operational.FailureHandling.CircuitBreaker != nil &&
+			cfg.Operational.FailureHandling.CircuitBreaker.Enabled {
+			circuitBreakerConfig = &health.CircuitBreakerConfig{
+				Enabled:          true,
+				FailureThreshold: cfg.Operational.FailureHandling.CircuitBreaker.FailureThreshold,
+				Timeout:          time.Duration(cfg.Operational.FailureHandling.CircuitBreaker.Timeout),
+			}
+		}
+
+		defaults := health.DefaultConfig()
+		healthMonitorConfig = &health.MonitorConfig{
+			CheckInterval:      checkInterval,
+			UnhealthyThreshold: cfg.Operational.FailureHandling.UnhealthyThreshold,
+			Timeout:            defaults.Timeout,
+			DegradedThreshold:  defaults.DegradedThreshold,
+			CircuitBreaker:     circuitBreakerConfig,
+		}
+		logger.Info("Health monitoring configured from operational settings")
+	}
+
 	serverCfg := &vmcpserver.Config{
-		Name:              cfg.Name,
-		Version:           getVersion(),
-		Host:              host,
-		Port:              port,
-		AuthMiddleware:    authMiddleware,
-		AuthInfoHandler:   authInfoHandler,
-		TelemetryProvider: telemetryProvider,
-		AuditConfig:       cfg.Audit,
+		Name:                cfg.Name,
+		Version:             getVersion(),
+		Host:                host,
+		Port:                port,
+		AuthMiddleware:      authMiddleware,
+		AuthInfoHandler:     authInfoHandler,
+		TelemetryProvider:   telemetryProvider,
+		AuditConfig:         cfg.Audit,
+		HealthMonitorConfig: healthMonitorConfig,
 	}
 
 	// Convert composite tool configurations to workflow definitions