Commit 499060e

fix(webapp): scope leaderElection-lost recovery to reconnect strategy
The previous commit routed leaderElection(false) through handle(), which under the exit strategy schedules process.exit. In a multi-instance deployment that turns lost leader election — a normal operational state — into a restart loop: exit, supervisor restarts, election fails again, exit, and so on. Add a dedicated notifyLeaderElectionLost() on ReplicationErrorRecovery that the reconnect strategy treats as another retry trigger, while exit and log strategies no-op. Wire the wrapper services through the new method.
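
For orientation, here is a minimal sketch of the strategy dispatch this change slots into. It is not the actual implementation: the factory signature, the deps object, the strategy shape, and the backoff math are assumptions; only the notifyLeaderElectionLost body mirrors the diff below. The point it illustrates is that handle() escalates according to the configured strategy (including scheduling process.exit under "exit"), while notifyLeaderElectionLost() only ever schedules a reconnect.

// Hypothetical sketch, not the real replicationErrorRecovery.server.ts.
type RecoveryStrategy =
  | { type: "reconnect"; initialDelayMs: number }
  | { type: "exit" }
  | { type: "log" };

type ReplicationErrorRecovery = {
  handle(error: unknown): void;
  notifyStreamStarted(): void;
  notifyLeaderElectionLost(error: unknown): void;
  dispose(): void;
};

function createReplicationErrorRecovery(
  strategy: RecoveryStrategy,
  deps: {
    isShuttingDown(): boolean;
    reconnect(): Promise<void>;
    log(error: unknown): void;
  }
): ReplicationErrorRecovery {
  let attempt = 0;
  let pendingReconnect: ReturnType<typeof setTimeout> | undefined;

  function scheduleReconnect(error: unknown) {
    if (strategy.type !== "reconnect") return;
    // Assumed backoff shape: double the initial delay per failed attempt.
    const delayMs = strategy.initialDelayMs * 2 ** attempt;
    attempt += 1;
    deps.log(error);
    pendingReconnect = setTimeout(() => void deps.reconnect(), delayMs);
  }

  return {
    handle(error) {
      if (deps.isShuttingDown()) return;
      if (strategy.type === "exit") {
        deps.log(error);
        // This is the path that made routing leaderElection(false) through
        // handle() dangerous: the supervisor restarts the process, the
        // election fails again, and the instance restart-loops. The zero
        // delay here is an invented placeholder for "schedules process.exit".
        pendingReconnect = setTimeout(() => process.exit(1), 0);
      } else if (strategy.type === "reconnect") {
        scheduleReconnect(error);
      } else {
        deps.log(error);
      }
    },
    notifyStreamStarted() {
      // A healthy stream resets the backoff counter (per the diff below).
      attempt = 0;
    },
    notifyLeaderElectionLost(error) {
      if (deps.isShuttingDown()) return;
      // Losing the lock to a peer is normal; only reconnect retries.
      if (strategy.type !== "reconnect") return;
      scheduleReconnect(error);
    },
    dispose() {
      if (pendingReconnect) clearTimeout(pendingReconnect);
    },
  };
}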

3 files changed: 18 additions & 4 deletions

apps/webapp/app/services/replicationErrorRecovery.server.ts

Lines changed: 13 additions & 0 deletions
@@ -44,6 +44,11 @@ export type ReplicationErrorRecovery = {
   // Called from the replication client's "start" event handler. Resets the
   // reconnect attempt counter so the next failure starts from initialDelayMs.
   notifyStreamStarted(): void;
+  // Called from the replication client's "leaderElection" event handler with
+  // isLeader=false. Only the reconnect strategy acts on this; exit and log
+  // strategies treat losing the lock as a normal multi-instance state (an
+  // "exit" instance would otherwise restart-loop whenever a peer holds it).
+  notifyLeaderElectionLost(error: unknown): void;
   // Cancel any pending reconnect/exit timer. Called from shutdown().
   dispose(): void;
 };

@@ -145,6 +150,14 @@ export function createReplicationErrorRecovery(
         attempt = 0;
       }
     },
+    notifyLeaderElectionLost(error) {
+      if (isShuttingDown()) return;
+      // Only the reconnect strategy should react. For exit, losing the
+      // lock to a peer would otherwise trigger a restart loop. For log,
+      // we keep historical no-op semantics.
+      if (strategy.type !== "reconnect") return;
+      scheduleReconnect(error);
+    },
     dispose() {
       if (pendingReconnect) {
         clearTimeout(pendingReconnect);
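
The type comments above say when each method is meant to be called. The following wiring sketch makes that lifecycle concrete; the event names ("start", "leaderElection", "error") come from those comments, but the ReplicationClientLike interface, listener signatures, and helper names are assumptions rather than the real replication client API, and ReplicationErrorRecovery refers to the sketch type shown after the commit message.

// Minimal event-source interface so the sketch stays self-contained; the real
// replication client's surface is not shown in this diff.
interface ReplicationClientLike {
  on(event: "start", listener: () => void): void;
  on(event: "leaderElection", listener: (isLeader: boolean) => void): void;
  on(event: "error", listener: (error: unknown) => void): void;
}

function wireRecovery(
  client: ReplicationClientLike,
  recovery: ReplicationErrorRecovery
) {
  // A fresh stream means the last failure is behind us: reset the backoff.
  client.on("start", () => recovery.notifyStreamStarted());

  // Hard stream errors still go through the configured strategy
  // (reconnect, exit, or log).
  client.on("error", (error) => recovery.handle(error));

  // Losing the election is not an error. Only the reconnect strategy retries;
  // exit and log instances simply wait while a peer holds the lock.
  client.on("leaderElection", (isLeader) => {
    if (!isLeader) {
      recovery.notifyLeaderElectionLost(
        new Error("Failed to acquire replication leader lock")
      );
    }
  });
}

function shutdownRecovery(recovery: ReplicationErrorRecovery) {
  // shutdown() cancels any pending reconnect/exit timer.
  recovery.dispose();
}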

apps/webapp/app/services/runsReplicationService.server.ts

Lines changed: 4 additions & 3 deletions
@@ -289,9 +289,10 @@ export class RunsReplicationService {
       if (!isLeader) {
         // Failed leader election doesn't throw or emit an "error" event —
         // subscribe() just emits leaderElection(false), calls stop(), and
-        // returns. Nudge the recovery handler so reconnect doesn't silently
-        // stall when another instance holds the lock.
-        this._errorRecovery.handle(
+        // returns. Route through a dedicated handler so only the reconnect
+        // strategy acts; the exit strategy must not restart-loop when
+        // another instance holds the lock.
+        this._errorRecovery.notifyLeaderElectionLost(
           new Error("Failed to acquire replication leader lock")
         );
       }
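
The comment in this handler describes how subscribe() reports a failed election. A purely hypothetical sketch of that client-side path (none of these names are taken from the real client) shows why the wrapper must react to the leaderElection event rather than to a thrown error or an "error" event:

// Hypothetical internals of the client's subscribe() path, written only to
// illustrate the comment above: a failed election never throws and never
// emits "error"; it surfaces solely as leaderElection(false).
interface LeaderElectingClient {
  tryAcquireLeaderLock(): Promise<boolean>;
  emit(event: "leaderElection", isLeader: boolean): void;
  startStreaming(): Promise<void>;
  stop(): Promise<void>;
}

async function subscribe(client: LeaderElectingClient): Promise<void> {
  const isLeader = await client.tryAcquireLeaderLock();
  client.emit("leaderElection", isLeader);

  if (!isLeader) {
    // No throw, no "error" event: just stop and return, which is why the
    // wrapper's leaderElection handler is the only place a retry can start.
    await client.stop();
    return;
  }

  await client.startStreaming();
}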

apps/webapp/app/services/sessionsReplicationService.server.ts

Lines changed: 1 addition & 1 deletion
@@ -269,7 +269,7 @@ export class SessionsReplicationService {
       this.logger.info("Leader election", { isLeader });
       if (!isLeader) {
         // See RunsReplicationService for the rationale.
-        this._errorRecovery.handle(
+        this._errorRecovery.notifyLeaderElectionLost(
           new Error("Failed to acquire replication leader lock")
         );
       }
