From 8108dd56d05f2111f0cd525cef88da68f2252925 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 09:45:41 +0100 Subject: [PATCH 01/23] feat(webapp): add a new backend for the realtime runs feed Adds an opt-in backend for realtime run subscriptions (single runs, tag lists, and batches), selected per organization by a feature flag and gated by a global environment-variable switch, both defaulting off so nothing changes until enabled. Run changes are signalled over Redis pub/sub; a live subscription wakes, refetches the current rows from a read replica, and re-emits them, resolving tag and batch membership from ClickHouse. Concurrent subscribers watching the same runs, tags, or batch share a single resolve-and-hydrate per short window, so read load scales with distinct filters rather than connection count. --- .../realtime-runs-subscription-scalability.md | 6 + apps/webapp/app/entry.server.tsx | 4 + apps/webapp/app/env.server.ts | 25 + .../app/routes/api.v1.runs.$runId.tags.ts | 3 + .../routes/realtime.v1.batches.$batchId.ts | 8 +- .../app/routes/realtime.v1.runs.$runId.ts | 9 +- apps/webapp/app/routes/realtime.v1.runs.ts | 8 +- .../app/services/realtime/boundedTtlCache.ts | 57 ++ .../clickHouseRunListResolver.server.ts | 40 + .../realtime/electricStreamProtocol.server.ts | 301 +++++++ .../realtime/notifierRealtimeClient.server.ts | 752 ++++++++++++++++++ .../notifierRealtimeClientInstance.server.ts | 99 +++ .../realtimeConcurrencyLimiter.server.ts | 111 +++ .../resolveRealtimeStreamClient.server.ts | 86 ++ .../realtime/runChangeNotifier.server.ts | 228 ++++++ .../runChangeNotifierHandlers.server.ts | 73 ++ .../runChangeNotifierInstance.server.ts | 73 ++ .../app/services/realtime/runReader.server.ts | 191 +++++ .../services/realtime/shadowCompare.server.ts | 289 +++++++ .../realtime/shadowRealtimeClient.server.ts | 192 +++++ .../shadowRealtimeClientInstance.server.ts | 66 ++ apps/webapp/app/v3/featureFlags.ts | 5 + 
.../test/realtime/boundedTtlCache.test.ts | 41 + .../realtime/electricStreamProtocol.test.ts | 304 +++++++ .../realtime/notifierRealtimeClient.test.ts | 107 +++ .../test/realtime/notifierRunSetCache.test.ts | 173 ++++ .../test/realtime/runChangeNotifier.test.ts | 211 +++++ .../test/realtime/runReaderProjection.test.ts | 57 ++ .../test/realtime/shadowCompare.test.ts | 212 +++++ 29 files changed, 3725 insertions(+), 6 deletions(-) create mode 100644 .server-changes/realtime-runs-subscription-scalability.md create mode 100644 apps/webapp/app/services/realtime/boundedTtlCache.ts create mode 100644 apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts create mode 100644 apps/webapp/app/services/realtime/electricStreamProtocol.server.ts create mode 100644 apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts create mode 100644 apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts create mode 100644 apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts create mode 100644 apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts create mode 100644 apps/webapp/app/services/realtime/runChangeNotifier.server.ts create mode 100644 apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts create mode 100644 apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts create mode 100644 apps/webapp/app/services/realtime/runReader.server.ts create mode 100644 apps/webapp/app/services/realtime/shadowCompare.server.ts create mode 100644 apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts create mode 100644 apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts create mode 100644 apps/webapp/test/realtime/boundedTtlCache.test.ts create mode 100644 apps/webapp/test/realtime/electricStreamProtocol.test.ts create mode 100644 apps/webapp/test/realtime/notifierRealtimeClient.test.ts create mode 100644 apps/webapp/test/realtime/notifierRunSetCache.test.ts 
create mode 100644 apps/webapp/test/realtime/runChangeNotifier.test.ts create mode 100644 apps/webapp/test/realtime/runReaderProjection.test.ts create mode 100644 apps/webapp/test/realtime/shadowCompare.test.ts diff --git a/.server-changes/realtime-runs-subscription-scalability.md b/.server-changes/realtime-runs-subscription-scalability.md new file mode 100644 index 00000000000..5de00aae675 --- /dev/null +++ b/.server-changes/realtime-runs-subscription-scalability.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Add a new backend for the realtime runs feed (single runs, tags, and batches) that scales under high concurrency, available behind a feature flag diff --git a/apps/webapp/app/entry.server.tsx b/apps/webapp/app/entry.server.tsx index 9996eb7b30a..8cc23bff089 100644 --- a/apps/webapp/app/entry.server.tsx +++ b/apps/webapp/app/entry.server.tsx @@ -27,6 +27,7 @@ import { registerRunEngineEventBusHandlers, setupBatchQueueCallbacks, } from "./v3/runEngineHandlers.server"; +import { registerRunChangeNotifierHandlers } from "./services/realtime/runChangeNotifierHandlers.server"; // Touch the sessions replication singleton at entry so it boots deterministically // on webapp startup. The singleton's initializer wires start (gated on // `clickhouseFactory.isReady()`) and SIGTERM/SIGINT shutdown — mirrors @@ -269,6 +270,9 @@ process.on("uncaughtException", (error, origin) => { singleton("RunEngineEventBusHandlers", registerRunEngineEventBusHandlers); singleton("SetupBatchQueueCallbacks", setupBatchQueueCallbacks); +// Attach the run-changed notifier delegations to the engine event bus. +// No-ops (registers nothing) unless REALTIME_NOTIFIER_ENABLED=1. 
+singleton("RunChangeNotifierHandlers", registerRunChangeNotifierHandlers); // Wrapped in singleton() so Remix's dev-mode CJS reloads don't append // duplicate copies of the processor — Sentry's processor list lives in diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index c55bb424001..3cdfdbf51fc 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -300,6 +300,31 @@ const EnvironmentSchema = z .int() .default(24 * 60 * 60 * 1000), // 1 day in milliseconds + // Master switch for the notifier-backed realtime feed. + // "0" (default) = the existing realtime path serves everything, publishes are + // no-ops, and no notifier Redis connections are opened (zero-overhead off). + // "1" = run-changed signals are published and the per-org `realtimeBackend` + // feature flag selects the backend per request. + REALTIME_NOTIFIER_ENABLED: z.string().default("0"), + // Backstop wait before a live notifier request refetches the run (ms). + REALTIME_NOTIFIER_LIVE_POLL_TIMEOUT_MS: z.coerce.number().int().default(5_000), + // Hard cap on the tag-list snapshot size served by the notifier feed. + REALTIME_NOTIFIER_MAX_LIST_RESULTS: z.coerce.number().int().default(1_000), + // Short-TTL coalescing cache for the multi-run (tag-list/batch) resolve+hydrate. + // Concurrent same-filter feeds share one ClickHouse resolve + Postgres hydrate + // within this window, so an env-wide wake doesn't fan out into per-feed queries. + // Staleness budget: a newly-matching run is visible within ~ttl + poll interval. + REALTIME_NOTIFIER_RUNSET_CACHE_TTL_MS: z.coerce.number().int().default(1_000), + REALTIME_NOTIFIER_RUNSET_CACHE_MAX_ENTRIES: z.coerce.number().int().default(5_000), + // Cap on the per-handle working-set cache (runId -> updatedAt) the notifier keeps + // for diffing multi-run live polls. 
+ REALTIME_NOTIFIER_WORKING_SET_MAX_ENTRIES: z.coerce.number().int().default(10_000), + // Quantize the tag-list createdAt lower bound to this epoch-aligned bucket (ms) so + // same-tag feeds that pin their window within the same bucket share one resolve+ + // hydrate cache entry. Floored, so the window only ever widens by < bucket. 0 + // disables bucketing (each feed keeps its exact lower bound). + REALTIME_NOTIFIER_RUNSET_CREATED_AT_BUCKET_MS: z.coerce.number().int().default(60_000), + PUBSUB_REDIS_HOST: z .string() .optional() diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts index ef7f3180bf3..9dd184fa25e 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts @@ -7,6 +7,7 @@ import { MAX_TAGS_PER_RUN } from "~/models/taskRunTag.server"; import { authenticateApiRequest } from "~/services/apiAuth.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; import { logger } from "~/services/logger.server"; +import { publishRunChanged } from "~/services/realtime/runChangeNotifierInstance.server"; import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server"; // Pull the existing tags out of a buffer entry's serialised payload so @@ -90,6 +91,8 @@ export async function action({ request, params }: ActionFunctionArgs) { }, data: { runTags: { push: newTags } }, }); + // Delegate a run-changed notify (no-op unless enabled). + publishRunChanged({ runId: taskRun.id, environmentId: env.id }); return json({ message: `Successfully set ${newTags.length} new tags.` }, { status: 200 }); }, // Buffer-applied patch path. 
The mutateSnapshot Lua deduplicates diff --git a/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts b/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts index 2b8fb106681..973cd5f96cd 100644 --- a/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts +++ b/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts @@ -1,7 +1,7 @@ import { z } from "zod"; import { $replica } from "~/db.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; -import { realtimeClient } from "~/services/realtimeClientGlobal.server"; +import { resolveRealtimeStreamClient } from "~/services/realtime/resolveRealtimeStreamClient.server"; import { anyResource, createLoaderApiRoute } from "~/services/routeBuilders/apiBuilder.server"; const ParamsSchema = z.object({ @@ -33,7 +33,11 @@ export const loader = createLoaderApiRoute( }, }, async ({ authentication, request, resource: batchRun, apiVersion }) => { - return realtimeClient.streamBatch( + // Pick the Electric proxy or the notifier-backed batch feed + // per org (defaults to Electric). Both implement streamBatch. 
+ const client = await resolveRealtimeStreamClient(authentication.environment); + + return client.streamBatch( request.url, authentication.environment, batchRun.id, diff --git a/apps/webapp/app/routes/realtime.v1.runs.$runId.ts b/apps/webapp/app/routes/realtime.v1.runs.$runId.ts index e03787c6200..3e224ddedf2 100644 --- a/apps/webapp/app/routes/realtime.v1.runs.$runId.ts +++ b/apps/webapp/app/routes/realtime.v1.runs.$runId.ts @@ -2,7 +2,7 @@ import { json } from "@remix-run/server-runtime"; import { z } from "zod"; import { $replica } from "~/db.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; -import { realtimeClient } from "~/services/realtimeClientGlobal.server"; +import { resolveRealtimeStreamClient } from "~/services/realtime/resolveRealtimeStreamClient.server"; import { anyResource, createLoaderApiRoute, @@ -48,7 +48,12 @@ export const loader = createLoaderApiRoute( }, }, async ({ authentication, request, resource: run, apiVersion }) => { - return realtimeClient.streamRun( + // Pick the Electric proxy or the notifier-backed shim per org (defaults to + // Electric; controlled by REALTIME_NOTIFIER_ENABLED + the realtimeBackend + // feature flag). Both implement the same streamRun contract. 
+ const client = await resolveRealtimeStreamClient(authentication.environment); + + return client.streamRun( request.url, authentication.environment, run.id, diff --git a/apps/webapp/app/routes/realtime.v1.runs.ts b/apps/webapp/app/routes/realtime.v1.runs.ts index b04c2d55bbc..436f4ef48d8 100644 --- a/apps/webapp/app/routes/realtime.v1.runs.ts +++ b/apps/webapp/app/routes/realtime.v1.runs.ts @@ -1,6 +1,6 @@ import { z } from "zod"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; -import { realtimeClient } from "~/services/realtimeClientGlobal.server"; +import { resolveRealtimeStreamClient } from "~/services/realtime/resolveRealtimeStreamClient.server"; import { anyResource, createLoaderApiRoute, @@ -39,7 +39,11 @@ export const loader = createLoaderApiRoute( }, }, async ({ searchParams, authentication, request, apiVersion }) => { - return realtimeClient.streamRuns( + // Pick the Electric proxy or the notifier-backed tag-list feed per org + // (defaults to Electric). Both implement streamRuns. + const client = await resolveRealtimeStreamClient(authentication.environment); + + return client.streamRuns( request.url, authentication.environment, searchParams, diff --git a/apps/webapp/app/services/realtime/boundedTtlCache.ts b/apps/webapp/app/services/realtime/boundedTtlCache.ts new file mode 100644 index 00000000000..643f23607c5 --- /dev/null +++ b/apps/webapp/app/services/realtime/boundedTtlCache.ts @@ -0,0 +1,57 @@ +/** + * Tiny in-process bounded TTL cache shared by the realtime feeds. + * + * Entries expire after `ttlMs`. An expired entry is evicted when read (`get`); on + * write, if the cache is at `maxEntries`, expired entries are swept and, if it's + * still full (pathologically all live), the oldest insertion is dropped. Node is + * single-threaded so no locking is needed. Used where a miss is cheap and + * correctness-safe (read-through hydration, per-handle working sets, per-org flag + * resolution). 
+ *
+ * A stored value of `undefined` cannot be distinguished from a miss; callers that
+ * need to cache "absence" should store an explicit sentinel (e.g. `null`).
+ */
+export class BoundedTtlCache<V> {
+  readonly #entries = new Map<string, { value: V; expiresAt: number }>();
+
+  constructor(
+    private readonly ttlMs: number,
+    private readonly maxEntries: number
+  ) {}
+
+  get(key: string): V | undefined {
+    const entry = this.#entries.get(key);
+    if (!entry) {
+      return undefined;
+    }
+    if (entry.expiresAt > Date.now()) {
+      return entry.value;
+    }
+    // Evict on read so expired entries don't linger until the next at-capacity
+    // sweep — important for read-heavy / low-churn caches (per-handle working sets).
+    this.#entries.delete(key);
+    return undefined;
+  }
+
+  set(key: string, value: V): void {
+    if (!this.#entries.has(key) && this.#entries.size >= this.maxEntries) { // only a NEW key needs room
+      const now = Date.now();
+      for (const [candidate, entry] of this.#entries) {
+        if (entry.expiresAt <= now) {
+          this.#entries.delete(candidate);
+        }
+      }
+      if (this.#entries.size >= this.maxEntries) {
+        const oldest = this.#entries.keys().next().value; // Map iterates in insertion order
+        if (oldest !== undefined) {
+          this.#entries.delete(oldest);
+        }
+      }
+    }
+    this.#entries.set(key, { value, expiresAt: Date.now() + this.ttlMs });
+  }
+
+  get size(): number {
+    return this.#entries.size;
+  }
+}
diff --git a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts
new file mode 100644
index 00000000000..545c4a43211
--- /dev/null
+++ b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts
@@ -0,0 +1,40 @@
+import { type ClickHouse } from "@internal/clickhouse";
+import { type PrismaClientOrTransaction } from "~/db.server";
+import { RunsRepository } from "~/services/runsRepository/runsRepository.server";
+import { type RunListFilter, type RunListResolver } from "./runReader.server";
+
+export type ClickHouseRunListResolverOptions = {
+  /** Resolves the per-organization ClickHouse client (multi-tenant routing).
*/
+  getClickhouse: (organizationId: string) => Promise<ClickHouse>;
+  prisma: PrismaClientOrTransaction;
+};
+
+/**
+ * Resolves the realtime tag/list filter into matching run ids via ClickHouse
+ * `listRunIds`. Tag matching is contains-ANY (OR), the same
+ * semantics the dashboard runs list uses. Filter-only: ids only, hydrated from
+ * Postgres by id afterward. This keeps the realtime tag feed off the Postgres
+ * `runTags` GIN index entirely.
+ *
+ * (Multi-tag subscribeToRunsWithTag is therefore OR, not the AND that Electric's
+ * `runTags @> ARRAY[...]` shape used. Restoring AND is a follow-up: add a
+ * `hasAll` mode to the ClickHouse runs filter and use it here.)
+ */
+export class ClickHouseRunListResolver implements RunListResolver {
+  constructor(private readonly options: ClickHouseRunListResolverOptions) {}
+
+  async resolveMatchingRunIds(filter: RunListFilter): Promise<string[]> {
+    const clickhouse = await this.options.getClickhouse(filter.organizationId);
+    const repository = new RunsRepository({ clickhouse, prisma: this.options.prisma });
+
+    return repository.listRunIds({
+      organizationId: filter.organizationId,
+      projectId: filter.projectId,
+      environmentId: filter.environmentId,
+      tags: filter.tags && filter.tags.length > 0 ? filter.tags : undefined, // empty list = no tag filter
+      batchId: filter.batchId,
+      from: filter.createdAtAfter?.getTime(),
+      page: { size: filter.limit },
+    });
+  }
+}
diff --git a/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts b/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts
new file mode 100644
index 00000000000..c7c90a7f17b
--- /dev/null
+++ b/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts
@@ -0,0 +1,301 @@
+/**
+ * Electric HTTP shape-stream wire protocol serializer for the single-run feed.
+ *
+ * This re-emits the exact wire shape that the deployed `@electric-sql/client`
+ * (1.0.14 modern + 0.4.0 legacy) and the SDK's `SubscribeRunRawShape` expect,
+ * so the notifier-backed realtime feed stays byte-faithful to what those clients
+ * already parse.
+ *
+ * The module is intentionally pure: no DB, Redis, or env access, so the wire
+ * contract can be unit-tested by round-tripping through the real client parser
+ * + the SDK schema. Header rewrites, tokens, and transport live in the client.
+ *
+ * Wire facts this encodes (verified against @electric-sql/client@1.0.14):
+ *  - Response body is a JSON array of messages; an empty body is treated as `[]`.
+ *  - Each column value is wire-encoded as a STRING (or null); the client decodes
+ *    it back using the per-column `electric-schema` header. Columns absent from
+ *    the schema are passed through unparsed (so text/timestamp stay strings).
+ *  - `up-to-date` is the only control message that makes the client emit rows.
+ *  - Re-sending the full row each cycle is idempotent: the client merges by `key`.
+ */
+
+export type ElectricColumnType =
+  | "text"
+  | "timestamp"
+  | "int4"
+  | "int8"
+  | "float8"
+  | "bool"
+  | "jsonb";
+
+type ElectricColumn = { // one exposed column: drives the schema header and value encoding
+  name: string;
+  type: ElectricColumnType;
+  /** Array dimensionality. 1 => `type[]` (Postgres `{a,b}` literal). */
+  dims?: number;
+  /**
+   * Array columns only. True when the Postgres column has NO default, so an
+   * empty/absent value is stored as SQL NULL (Electric emits `null`) rather than
+   * an empty-array literal `{}`. Prisma erases this distinction — it coerces both
+   * NULL and `{}` to `[]` on read — so we re-derive the wire form from the column's
+   * known schema. `runTags` has no default; `realtimeStreams` has `@default([])`.
+   */
+  emptyArrayAsNull?: boolean;
+};
+
+/**
+ * The columns the realtime run feed exposes, mirroring `DEFAULT_ELECTRIC_COLUMNS`
+ * in `realtimeClient.server.ts` and their Postgres types from the `TaskRun`
+ * Prisma model. The `type`/`dims` drive both the `electric-schema` header and
+ * the value encoding. Keep in sync with `DEFAULT_ELECTRIC_COLUMNS`.
+ */
+export const RUN_ELECTRIC_COLUMNS: ReadonlyArray<ElectricColumn> = [
+  { name: "id", type: "text" },
+  { name: "taskIdentifier", type: "text" },
+  { name: "createdAt", type: "timestamp" },
+  { name: "updatedAt", type: "timestamp" },
+  { name: "startedAt", type: "timestamp" },
+  { name: "delayUntil", type: "timestamp" },
+  { name: "queuedAt", type: "timestamp" },
+  { name: "expiredAt", type: "timestamp" },
+  { name: "completedAt", type: "timestamp" },
+  { name: "friendlyId", type: "text" },
+  { name: "number", type: "int4" },
+  { name: "isTest", type: "bool" },
+  { name: "status", type: "text" },
+  { name: "usageDurationMs", type: "int4" },
+  { name: "costInCents", type: "float8" },
+  { name: "baseCostInCents", type: "float8" },
+  { name: "ttl", type: "text" },
+  { name: "payload", type: "text" },
+  { name: "payloadType", type: "text" },
+  { name: "metadata", type: "text" },
+  { name: "metadataType", type: "text" },
+  { name: "output", type: "text" },
+  { name: "outputType", type: "text" },
+  { name: "runTags", type: "text", dims: 1, emptyArrayAsNull: true }, // no column default: [] goes out as null
+  { name: "error", type: "jsonb" },
+  { name: "realtimeStreams", type: "text", dims: 1 }, // has @default([]): [] goes out as {}
+];
+
+/** Columns that can never be skipped via `skipColumns` (mirrors realtimeClient). */
+export const RESERVED_COLUMNS = ["id", "taskIdentifier", "friendlyId", "status", "createdAt"];
+
+/**
+ * Shape of a single run hydrated for the realtime feed. Structurally compatible
+ * with the Prisma `TaskRun` projection produced by `RunHydrator`.
+ */
+export type RealtimeRunRow = {
+  id: string;
+  taskIdentifier: string;
+  createdAt: Date;
+  updatedAt: Date;
+  startedAt: Date | null;
+  delayUntil: Date | null;
+  queuedAt: Date | null;
+  expiredAt: Date | null;
+  completedAt: Date | null;
+  friendlyId: string;
+  number: number;
+  isTest: boolean;
+  status: string;
+  usageDurationMs: number;
+  costInCents: number;
+  baseCostInCents: number;
+  ttl: string | null;
+  payload: string;
+  payloadType: string;
+  metadata: string | null;
+  metadataType: string;
+  output: string | null;
+  outputType: string;
+  runTags: string[];
+  error: unknown;
+  realtimeStreams: string[];
+};
+
+type Operation = "insert" | "update" | "delete";
+
+type ChangeMessage = {
+  key: string;
+  value: Record<string, string | null>;
+  headers: { operation: Operation };
+};
+
+type ControlMessage = {
+  headers: { control: "up-to-date" | "must-refetch" };
+};
+
+type ShapeMessage = ChangeMessage | ControlMessage;
+
+const UP_TO_DATE: ControlMessage = { headers: { control: "up-to-date" } };
+
+function effectiveSkipColumns(skipColumns: string[]): Set<string> { // drops "" and reserved names
+  return new Set(skipColumns.filter((c) => c !== "" && !RESERVED_COLUMNS.includes(c)));
+}
+
+function quoteArrayElement(value: string): string { // double-quote; escape backslash and quote
+  return `"${value.replace(/\\/g, "\\\\").replace(/"/g, '\\"')}"`;
+}
+
+function pgArrayLiteral(values: unknown[]): string { // Postgres array text form, e.g. {"a","b"}
+  if (values.length === 0) {
+    return "{}";
+  }
+  return `{${values.map((v) => quoteArrayElement(String(v))).join(",")}}`;
+}
+
+function serializeValue(value: unknown, column: ElectricColumn): string | null {
+  if (value === null || value === undefined) {
+    return null;
+  }
+
+  if (column.dims && column.dims > 0) {
+    if (!Array.isArray(value)) {
+      return null;
+    }
+    // A no-default array column stores NULL when empty, so Electric emits `null`
+    // (not `{}`); match that here since Prisma handed us `[]` for the NULL value.
+    if (value.length === 0 && column.emptyArrayAsNull) {
+      return null;
+    }
+    return pgArrayLiteral(value);
+  }
+
+  switch (column.type) {
+    case "bool":
+      // Postgres text representation; the client's parseBool accepts "t"/"f".
+      return value ? "t" : "f";
+    case "timestamp":
+      // The SDK's RawShapeDate appends "Z" before parsing, so we emit the ISO
+      // string WITHOUT the trailing "Z".
+      return value instanceof Date ? value.toISOString().slice(0, -1) : String(value);
+    case "jsonb":
+      return JSON.stringify(value);
+    case "int4":
+    case "int8":
+    case "float8":
+    case "text":
+    default:
+      return String(value);
+  }
+}
+
+/** The merge key the client uses to reassemble a row across insert/update cycles. */
+export function runShapeKey(runId: string): string {
+  return `"public"."TaskRun"/"${runId}"`;
+}
+
+/** Encode a single run row into the wire `value` object (column -> string|null). */
+export function serializeRunRow(
+  row: RealtimeRunRow,
+  skipColumns: string[] = []
+): Record<string, string | null> {
+  const skip = effectiveSkipColumns(skipColumns);
+  const value: Record<string, string | null> = {};
+
+  for (const column of RUN_ELECTRIC_COLUMNS) {
+    if (skip.has(column.name)) {
+      continue;
+    }
+    value[column.name] = serializeValue((row as Record<string, unknown>)[column.name], column);
+  }
+
+  return value;
+}
+
+/** The `electric-schema` response header value for the (optionally trimmed) column set. */
+export function buildElectricSchemaHeader(skipColumns: string[] = []): string {
+  const skip = effectiveSkipColumns(skipColumns);
+  const schema: Record<string, { type: ElectricColumnType; dims?: number }> = {};
+
+  for (const column of RUN_ELECTRIC_COLUMNS) {
+    if (skip.has(column.name)) {
+      continue;
+    }
+    schema[column.name] = column.dims ? { type: column.type, dims: column.dims } : { type: column.type };
+  }
+
+  return JSON.stringify(schema);
+}
+
+/**
+ * Initial snapshot body: a single `insert` for the row (if it exists) followed by
+ * `up-to-date`. An absent row emits a bare `up-to-date` (an empty shape), which is
+ * how Electric represents "no rows match".
+ */
+export function buildSnapshotBody(row: RealtimeRunRow | null, skipColumns: string[] = []): string {
+  const messages: ShapeMessage[] = [];
+  if (row) {
+    messages.push({
+      key: runShapeKey(row.id),
+      value: serializeRunRow(row, skipColumns),
+      headers: { operation: "insert" },
+    });
+  }
+  messages.push(UP_TO_DATE); // always last: the control message that makes the client emit rows
+  return JSON.stringify(messages);
+}
+
+/** Live body when the row advanced: a full-row `update` followed by `up-to-date`. */
+export function buildUpdateBody(row: RealtimeRunRow, skipColumns: string[] = []): string {
+  const messages: ShapeMessage[] = [
+    {
+      key: runShapeKey(row.id),
+      value: serializeRunRow(row, skipColumns),
+      headers: { operation: "update" },
+    },
+    UP_TO_DATE,
+  ];
+  return JSON.stringify(messages);
+}
+
+/** Live body when nothing advanced: a bare `up-to-date` (no row emission). */
+export function buildUpToDateBody(): string {
+  return JSON.stringify([UP_TO_DATE]);
+}
+
+export type RowChange = { row: RealtimeRunRow; operation: "insert" | "update" }; // one row plus how to apply it
+
+/**
+ * Multi-row body for the tag-list feed: one change message per row (insert for
+ * rows new to the shape, update for rows that advanced) followed by `up-to-date`.
+ * An empty `changes` array emits a bare `up-to-date`. The client merges every row
+ * by key, so re-emitting a full row is idempotent.
+ */
+export function buildRowsBody(changes: RowChange[], skipColumns: string[] = []): string {
+  const messages: ShapeMessage[] = changes.map((change) => ({
+    key: runShapeKey(change.row.id),
+    value: serializeRunRow(change.row, skipColumns),
+    headers: { operation: change.operation },
+  }));
+  messages.push(UP_TO_DATE);
+  return JSON.stringify(messages);
+}
+
+export const INITIAL_OFFSET = "-1"; // offset sent by a request that has no snapshot yet
+
+/**
+ * Opaque offset token, formatted to satisfy the client's `${number}_${number}`
+ * type.
The first segment is the row's `updatedAt` epoch-ms (lets a live request
+ * detect whether the replica row has advanced past what the client already has);
+ * the second is a sequence counter supplied by the caller.
+ */
+export function encodeOffset(updatedAtMs: number, seq: number): string {
+  return `${Math.trunc(updatedAtMs)}_${Math.trunc(seq)}`;
+}
+
+/** Extract the `updatedAt` epoch-ms a client last saw from its echoed offset. */
+export function parseOffsetUpdatedAtMs(offset: string | null | undefined): number {
+  if (!offset) {
+    return 0;
+  }
+  const [first] = offset.split("_"); // first segment is the updatedAt epoch-ms written by encodeOffset
+  const value = Number(first);
+  return Number.isFinite(value) && value > 0 ? value : 0; // "-1" and unparsable offsets both yield 0
+}
+
+/** Mirror of realtimeClient's DEQUEUED->EXECUTING rewrite for non-current API versions. */
+export function rewriteBodyForLegacyApiVersion(body: string): string {
+  return body.replace(/"status":"DEQUEUED"/g, '"status":"EXECUTING"');
+}
diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts
new file mode 100644
index 00000000000..9c70fd1acb9
--- /dev/null
+++ b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts
@@ -0,0 +1,752 @@
+import { json } from "@remix-run/server-runtime";
+import { safeParseNaturalLanguageDurationAgo } from "@trigger.dev/core/v3/isomorphic";
+import { randomUUID } from "node:crypto";
+import { API_VERSIONS, CURRENT_API_VERSION } from "~/api/versions";
+import {
+  type CachedLimitProvider,
+  type RealtimeEnvironment,
+  type RealtimeRequestOptions,
+  type RealtimeRunsParams,
+} from "../realtimeClient.server";
+import { logger } from "../logger.server";
+import {
+  buildElectricSchemaHeader,
+  buildRowsBody,
+  buildSnapshotBody,
+  buildUpdateBody,
+  buildUpToDateBody,
+  encodeOffset,
+  INITIAL_OFFSET,
+  parseOffsetUpdatedAtMs,
+  type RealtimeRunRow,
+  rewriteBodyForLegacyApiVersion,
+  RESERVED_COLUMNS,
+  type RowChange,
+} from "./electricStreamProtocol.server";
+import { BoundedTtlCache } from "./boundedTtlCache";
+import { type RunChangeNotifier, type RunChangeSubscription } from "./runChangeNotifier.server";
+import { type RunHydrator, type RunListResolver } from "./runReader.server";
+import { type RealtimeConcurrencyLimiter } from "./realtimeConcurrencyLimiter.server";
+
+/** The tag-list feed resolves ids via ClickHouse, which needs org + project + env.
+ * `authentication.environment` (AuthenticatedEnvironment) provides projectId, so
+ * widening here avoids touching the Electric client's RealtimeEnvironment type. */
+export type RealtimeListEnvironment = RealtimeEnvironment & { projectId: string };
+
+/** The realtime feeds the run routes depend on (single-run, tag-list, batch). Both
+ * the Electric client and this notifier client satisfy it, so the routes can switch
+ * between them behind a flag. */
+export interface RealtimeStreamClient {
+  streamRun(
+    url: URL | string,
+    environment: RealtimeEnvironment,
+    runId: string,
+    apiVersion: API_VERSIONS,
+    requestOptions?: RealtimeRequestOptions,
+    clientVersion?: string,
+    signal?: AbortSignal
+  ): Promise<Response>;
+  streamRuns(
+    url: URL | string,
+    environment: RealtimeListEnvironment,
+    params: RealtimeRunsParams,
+    apiVersion: API_VERSIONS,
+    requestOptions?: RealtimeRequestOptions,
+    clientVersion?: string,
+    signal?: AbortSignal
+  ): Promise<Response>;
+  streamBatch(
+    url: URL | string,
+    environment: RealtimeListEnvironment,
+    batchId: string,
+    apiVersion: API_VERSIONS,
+    requestOptions?: RealtimeRequestOptions,
+    clientVersion?: string,
+    signal?: AbortSignal
+  ): Promise<Response>;
+}
+
+export type WakeupReason = "notify" | "timeout" | "abort";
+
+export type NotifierRealtimeClientOptions = {
+  runReader: RunHydrator;
+  /** Resolves the tag/list filter into the matching id-set (filter-only).
*/
+  runListResolver: RunListResolver;
+  notifier: RunChangeNotifier;
+  limiter: RealtimeConcurrencyLimiter;
+  cachedLimitProvider: CachedLimitProvider;
+  /** Backstop wait before refetching on a live request (ms). Defaults to 5000. */
+  livePollTimeoutMs?: number;
+  /** Ceiling for the tag-list createdAt lookback window (ms). */
+  maximumCreatedAtFilterAgeMs: number;
+  /** Hard cap on tag-list snapshot size. Defaults to 1000. */
+  maxListResults?: number;
+  /** TTL (ms) for the multi-run resolve+hydrate coalescing cache. Defaults to 1000. */
+  runSetResolveCacheTtlMs?: number;
+  /** Max entries in the resolve+hydrate cache. Defaults to 5000. */
+  runSetResolveCacheMaxEntries?: number;
+  /** Max entries in the per-handle working-set cache. Defaults to 10000. */
+  listCacheMaxEntries?: number;
+  /** Epoch-aligned bucket (ms) the tag-list createdAt lower bound is floored to, so
+   * same-tag feeds pinned within the same bucket share a cache entry. Defaults to
+   * 60000. 0 disables bucketing. */
+  runSetCreatedAtBucketMs?: number;
+  /** Observability hook: why a live request woke (notify vs timeout vs abort). */
+  onWakeup?: (reason: WakeupReason) => void;
+  /** Observability hook: whether a multi-run resolve hit the cache, coalesced onto
+   * an in-flight resolve, or missed (issued fresh ClickHouse + Postgres queries). */
+  onRunSetResolve?: (result: "hit" | "miss" | "coalesced") => void;
+  /** Observability hook: latency (ms) of the ClickHouse resolve / Postgres hydrate. */
+  onRunSetQuery?: (stage: "resolve" | "hydrate", ms: number) => void;
+};
+
+const DEFAULT_CONCURRENCY_LIMIT = 100_000; // NOTE(review): usage not in view — confirm what this caps
+const DEFAULT_LIVE_POLL_TIMEOUT_MS = 5_000; // presumably the livePollTimeoutMs fallback — usage not in view
+const DEFAULT_MAX_LIST_RESULTS = 1_000; // presumably the maxListResults fallback — usage not in view
+const LIST_CACHE_TTL_MS = 5 * 60_000; // 5 minutes: TTL of the per-handle working-set cache
+const LIST_CACHE_MAX_ENTRIES = 10_000; // fallback for options.listCacheMaxEntries
+const DEFAULT_RUNSET_CACHE_TTL_MS = 1_000; // fallback for options.runSetResolveCacheTtlMs
+const DEFAULT_RUNSET_CACHE_MAX_ENTRIES = 5_000; // fallback for options.runSetResolveCacheMaxEntries
+const DEFAULT_RUNSET_CREATED_AT_BUCKET_MS = 60_000; // presumably the runSetCreatedAtBucketMs fallback — usage not in view
+
+/** A multi-run feed's filter.
Tag-list sets `tags` (+ pinned `createdAtAfter`); + * the batch feed sets `batchId`. Both resolve to an id-set via the resolver. */ +type RunSetFilter = { + tags?: string[]; + batchId?: string; + createdAtAfter?: Date; +}; + +/** Per-handle working set: runId -> last-emitted updatedAt (ms), so live polls + * emit only rows that advanced. */ +type WorkingSet = Map; + +type ResponseHeaderInput = { + offset: string; + handle: string; + cursor?: string; + schema?: string; +}; + +/** + * Notifier-backed implementation of the realtime run feeds: signals run changes + * over Redis pub/sub and refetches the current rows from a read replica. + * + * Single-run (`streamRun`): + * - initial (`offset=-1`): hydrate + emit `insert` + `up-to-date` (with schema) + * - live: race a per-run notification vs a ~5s backstop and the abort signal, + * refetch, and emit a full-row `update` ONLY when `updatedAt` advanced past what + * the client has (a stale replica read never regresses); else a bare `up-to-date`. + * + * Multi-run feeds (`streamRuns` tag-list, `streamBatch`) share one core: + * - initial: resolve the matching id-set via ClickHouse `listRunIds` (filter-only, + * tag-OR or batchId), hydrate by-id from Postgres, emit N `insert`s. + * - live: one per-env subscription wakes the feed; re-resolve the set, hydrate it, + * and emit only new (`insert`) / advanced (`update`) rows — diffed on the + * authoritative Postgres `updatedAt` against a per-handle working set (cache miss + * falls back to the offset floor, merge-safe). ClickHouse supplies membership; + * Postgres supplies fresh row state, so CH ingest lag never stales the rows. + * Tag-list pins its `createdAt` window in the handle; batch needs no window. + * + * Tokens are opaque: `offset` = `_`, `handle` is per-shape, + * `cursor` is a live-only counter. The wire format is produced by + * `electricStreamProtocol`. 
+ */ +export class NotifierRealtimeClient implements RealtimeStreamClient { + #seq = 0; + readonly #workingSetCache: BoundedTtlCache; + /** Coalescing cache for the multi-run (resolveIds -> hydrateByIds) pair, keyed by + * (env, filter, columns). Collapses an env-wide wake's per-feed query fan-out into + * one shared resolve+hydrate per filter per short window. */ + readonly #runSetCache: BoundedTtlCache; + readonly #runSetInflight = new Map>(); + + constructor(private readonly options: NotifierRealtimeClientOptions) { + this.#workingSetCache = new BoundedTtlCache( + LIST_CACHE_TTL_MS, + options.listCacheMaxEntries ?? LIST_CACHE_MAX_ENTRIES + ); + this.#runSetCache = new BoundedTtlCache( + options.runSetResolveCacheTtlMs ?? DEFAULT_RUNSET_CACHE_TTL_MS, + options.runSetResolveCacheMaxEntries ?? DEFAULT_RUNSET_CACHE_MAX_ENTRIES + ); + } + + /** Current size of the per-handle working-set cache (for a metrics gauge). */ + get workingSetCacheSize(): number { + return this.#workingSetCache.size; + } + + async streamRun( + url: URL | string, + environment: RealtimeEnvironment, + runId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const { offset, handle, isLive, skipColumns } = this.#parseStreamRequest(url, requestOptions); + + // Initial snapshot — no prior offset/handle. + if (offset === INITIAL_OFFSET || !handle) { + const row = await this.options.runReader.getRunById(environment.id, runId); + return this.#snapshotResponse(runId, row, skipColumns, apiVersion, clientVersion); + } + + if (isLive) { + return this.#liveResponse({ + environment, + runId, + offset, + handle, + skipColumns, + apiVersion, + clientVersion, + signal, + }); + } + + // Non-live catch-up with a handle: re-emit the current snapshot (idempotent). 
+ const row = await this.options.runReader.getRunById(environment.id, runId); + return this.#snapshotResponse(runId, row, skipColumns, apiVersion, clientVersion, handle); + } + + async streamRuns( + url: URL | string, + environment: RealtimeListEnvironment, + params: RealtimeRunsParams, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const { offset, handle, isLive, skipColumns } = this.#parseStreamRequest(url, requestOptions); + const tags = params.tags ?? []; + + // Initial snapshot — pin the createdAt window in a fresh handle. + if (offset === INITIAL_OFFSET || !handle) { + const createdAtFilterMs = this.#computeCreatedAtFilter(params.createdAt).getTime(); + return this.#runSetSnapshotResponse( + environment, + { tags, createdAtAfter: new Date(createdAtFilterMs) }, + this.#mintListHandle(createdAtFilterMs), + skipColumns, + apiVersion, + clientVersion + ); + } + + // Recover the pinned window from the handle so the lower bound never drifts. + const filter: RunSetFilter = { + tags, + createdAtAfter: new Date( + this.#filterMsFromHandle(handle) ?? this.#computeCreatedAtFilter(params.createdAt).getTime() + ), + }; + + if (isLive) { + return this.#runSetLiveResponse( + environment, + filter, + handle, + offset, + skipColumns, + apiVersion, + clientVersion, + signal + ); + } + + // Non-live catch-up under the same handle. 
+ return this.#runSetSnapshotResponse( + environment, + filter, + handle, + skipColumns, + apiVersion, + clientVersion + ); + } + + async streamBatch( + url: URL | string, + environment: RealtimeListEnvironment, + batchId: string, + apiVersion: API_VERSIONS, + requestOptions?: RealtimeRequestOptions, + clientVersion?: string, + signal?: AbortSignal + ): Promise { + const { offset, isLive, skipColumns } = this.#parseStreamRequest(url, requestOptions); + + // The batch set is fully defined by batchId (the route resolves it from the + // friendlyId on every request), so the handle is derived and stable and there's + // no createdAt window to pin. + const handle = `batch-${batchId}`; + const filter: RunSetFilter = { batchId }; + + if (offset !== INITIAL_OFFSET && isLive) { + return this.#runSetLiveResponse( + environment, + filter, + handle, + offset, + skipColumns, + apiVersion, + clientVersion, + signal + ); + } + + // Initial snapshot + non-live catch-up. + return this.#runSetSnapshotResponse( + environment, + filter, + handle, + skipColumns, + apiVersion, + clientVersion + ); + } + + #snapshotResponse( + runId: string, + row: Awaited>, + skipColumns: string[], + apiVersion: API_VERSIONS, + clientVersion?: string, + existingHandle?: string + ): Response { + const body = buildSnapshotBody(row, skipColumns); + const offset = row ? encodeOffset(row.updatedAt.getTime(), this.#nextSeq()) : encodeOffset(0, 0); + return this.#buildResponse(body, apiVersion, clientVersion, { + offset, + handle: existingHandle ?? 
this.#mintHandle(runId), + schema: buildElectricSchemaHeader(skipColumns), + }); + } + + async #liveResponse(params: { + environment: RealtimeEnvironment; + runId: string; + offset: string; + handle: string; + skipColumns: string[]; + apiVersion: API_VERSIONS; + clientVersion?: string; + signal?: AbortSignal; + }): Promise { + const { environment, runId, offset, handle, skipColumns, apiVersion, clientVersion, signal } = + params; + + return this.#withConcurrencySlot(environment, async () => { + const reason = await this.#waitForChange(runId, signal); + this.options.onWakeup?.(reason); + + const row = await this.options.runReader.getRunById(environment.id, runId); + const lastSeenMs = parseOffsetUpdatedAtMs(offset); + const seq = this.#nextSeq(); + + // Only-on-advance: emit a full-row update when the replica row moved past + // what the client already has; otherwise a bare up-to-date keeps the offset. + // Live responses carry electric-cursor but NOT electric-schema (the client + // already has the schema from the initial snapshot) — matching real Electric. + if (row && row.updatedAt.getTime() > lastSeenMs) { + return this.#buildResponse(buildUpdateBody(row, skipColumns), apiVersion, clientVersion, { + offset: encodeOffset(row.updatedAt.getTime(), seq), + handle, + cursor: String(seq), + }); + } + + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset, + handle, + cursor: String(seq), + }); + }); + } + + /** Initial (and non-live catch-up) snapshot for a multi-run feed: resolve the + * id-set, hydrate, emit every row as an `insert`, and seed the working set. 
*/
  async #runSetSnapshotResponse(
    environment: RealtimeListEnvironment,
    filter: RunSetFilter,
    handle: string,
    skipColumns: string[],
    apiVersion: API_VERSIONS,
    clientVersion?: string
  ): Promise<Response> {
    const rows = await this.#resolveAndHydrate(environment, filter, skipColumns);

    // A snapshot re-emits the whole set, so every row goes out as an `insert`.
    const changes: RowChange[] = rows.map((row) => ({ row, operation: "insert" as const }));

    // updatedAt comes from the authoritative Postgres hydrate, not ClickHouse.
    const seen: WorkingSet = new Map();
    let maxUpdatedAt = 0;
    for (const row of rows) {
      const updatedAtMs = row.updatedAt.getTime();
      seen.set(row.id, updatedAtMs);
      maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs);
    }
    // Seed (or replace) the working set that later live polls diff against.
    this.#workingSetCache.set(handle, seen);

    return this.#buildResponse(buildRowsBody(changes, skipColumns), apiVersion, clientVersion, {
      offset: encodeOffset(maxUpdatedAt, this.#nextSeq()),
      handle,
      schema: buildElectricSchemaHeader(skipColumns),
    });
  }

  /**
   * Live poll for a multi-run feed: wait, re-resolve the set, and emit only the
   * rows that are new or advanced vs the cached working set.
   *
   * When the wait ends because the request was aborted, the method answers with a
   * bare `up-to-date` at the unchanged `offset` and leaves the working set alone.
   * Refreshing the working set for a response that is never delivered would make
   * a retry under the same handle diff against rows the client never received and
   * silently drop those changes.
   */
  async #runSetLiveResponse(
    environment: RealtimeListEnvironment,
    filter: RunSetFilter,
    handle: string,
    offset: string,
    skipColumns: string[],
    apiVersion: API_VERSIONS,
    clientVersion: string | undefined,
    signal: AbortSignal | undefined
  ): Promise<Response> {
    return this.#withConcurrencySlot(environment, async () => {
      // One env-scoped subscription per feed (not one per run): any run change in
      // the env wakes us, then we re-resolve the filter.
      const reason = await this.#waitForEnvChange(environment.id, signal);
      this.options.onWakeup?.(reason);

      if (reason === "abort") {
        // Skip the ClickHouse + Postgres round trip and keep the working set as the
        // client last saw it.
        return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, {
          offset,
          handle,
          cursor: String(this.#nextSeq()),
        });
      }

      const cached = this.#workingSetCache.get(handle);
      const offsetFloorMs = parseOffsetUpdatedAtMs(offset);
      const seq = this.#nextSeq();

      // ClickHouse resolves the (possibly stale) membership; Postgres hydrates the
      // authoritative current rows, so status is always fresh even if CH lags. The
      // resolve+hydrate is coalesced + short-TTL cached so a single env-wide wake
      // doesn't fan out into one CH+PG query per concurrent same-filter feed.
      const rows = await this.#resolveAndHydrate(environment, filter, skipColumns);

      // Diff against what the client already has, using the hydrated updatedAt:
      // cache hit => per-row (new = insert, advanced = update); miss => anything
      // newer than the offset floor as a merge-safe update.
      const changes: RowChange[] = [];
      const seen: WorkingSet = new Map();
      let maxUpdatedAt = offsetFloorMs;
      for (const row of rows) {
        const updatedAtMs = row.updatedAt.getTime();
        seen.set(row.id, updatedAtMs);
        maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs);

        if (cached) {
          const prior = cached.get(row.id);
          if (prior === undefined) {
            changes.push({ row, operation: "insert" });
          } else if (updatedAtMs > prior) {
            changes.push({ row, operation: "update" });
          }
        } else if (updatedAtMs > offsetFloorMs) {
          changes.push({ row, operation: "update" });
        }
      }

      // Refresh the working set so runs that left the filter stop being tracked
      // (the client keeps showing them; the SDK never applies deletes).
      this.#workingSetCache.set(handle, seen);

      const body = changes.length === 0 ? buildUpToDateBody() : buildRowsBody(changes, skipColumns);

      return this.#buildResponse(body, apiVersion, clientVersion, {
        offset: encodeOffset(maxUpdatedAt, seq),
        handle,
        cursor: String(seq),
      });
    });
  }

  /**
   * Resolve the filter's id-set (ClickHouse) and hydrate the rows (Postgres),
   * coalesced + short-TTL cached by (env, filter, columns). Every batch feed for a
   * batch, and every tag feed sharing tags+window+columns, shares ONE resolve+hydrate
   * instead of each firing its own when the per-env channel wakes them together.
   * Concurrent callers await an in-flight resolve; callers within the TTL reuse the
   * cached rows (staleness budget: up to the TTL; the next live poll catches up).
*/
  async #resolveAndHydrate(
    environment: RealtimeListEnvironment,
    filter: RunSetFilter,
    skipColumns: string[]
  ): Promise<Awaited<ReturnType<RunHydrator["hydrateByIds"]>>> {
    const key = this.#runSetCacheKey(environment.id, filter, skipColumns);

    // 1) Fresh cached rows for this exact (env, filter, columns).
    const cached = this.#runSetCache.get(key);
    if (cached) {
      this.options.onRunSetResolve?.("hit");
      return cached;
    }

    // 2) Someone else is already resolving this key: share their promise.
    const existing = this.#runSetInflight.get(key);
    if (existing) {
      this.options.onRunSetResolve?.("coalesced");
      return existing;
    }

    // 3) Miss: run the queries once, publish the result to the cache on success,
    //    and always clear the in-flight marker (a failure is not cached, so the
    //    next caller retries).
    this.options.onRunSetResolve?.("miss");
    const promise = this.#resolveAndHydrateUncached(environment, filter, skipColumns)
      .then((rows) => {
        this.#runSetCache.set(key, rows);
        return rows;
      })
      .finally(() => {
        this.#runSetInflight.delete(key);
      });

    this.#runSetInflight.set(key, promise);
    return promise;
  }

  /** Runs the resolve (id-set) and hydrate (rows) stages back to back, reporting
   * each stage's wall-clock latency through `onRunSetQuery`. */
  async #resolveAndHydrateUncached(
    environment: RealtimeListEnvironment,
    filter: RunSetFilter,
    skipColumns: string[]
  ): Promise<Awaited<ReturnType<RunHydrator["hydrateByIds"]>>> {
    const resolveStart = Date.now();
    const ids = await this.#resolveIds(environment, filter);
    this.options.onRunSetQuery?.("resolve", Date.now() - resolveStart);

    const hydrateStart = Date.now();
    const rows = await this.options.runReader.hydrateByIds(environment.id, ids, skipColumns);
    this.options.onRunSetQuery?.("hydrate", Date.now() - hydrateStart);

    return rows;
  }

  /**
   * Stable cache key for the resolve+hydrate cache. Same key => same id-set and the
   * same projected columns, so cached rows always match the requesting feed.
   *
   * The key is a JSON-encoded tuple rather than a delimiter-joined string, so a tag
   * or column name that itself contains `,` or `|` cannot make two different
   * filters collide. Tags and columns are sorted (on copies) so ordering in the
   * request does not split the cache.
   */
  #runSetCacheKey(environmentId: string, filter: RunSetFilter, skipColumns: string[]): string {
    const maxListResults = this.options.maxListResults ?? DEFAULT_MAX_LIST_RESULTS;
    return JSON.stringify([
      environmentId,
      [...(filter.tags ?? [])].sort(),
      filter.batchId ?? null,
      filter.createdAtAfter?.getTime() ?? null,
      maxListResults,
      [...skipColumns].sort(),
    ]);
  }

  /** Asks the run-list resolver for the ids matching `filter`, capped at
   * `maxListResults`, and logs a warning when the cap is reached. */
  async #resolveIds(environment: RealtimeListEnvironment, filter: RunSetFilter): Promise<string[]> {
    const maxListResults = this.options.maxListResults ?? DEFAULT_MAX_LIST_RESULTS;
    const ids = await this.options.runListResolver.resolveMatchingRunIds({
      organizationId: environment.organizationId,
      projectId: environment.projectId,
      environmentId: environment.id,
      tags: filter.tags,
      batchId: filter.batchId,
      createdAtAfter: filter.createdAtAfter,
      limit: maxListResults,
    });

    if (ids.length >= maxListResults) {
      logger.warn("[notifierRealtimeClient] run-set feed hit the result cap", {
        environmentId: environment.id,
        filter,
        cap: maxListResults,
      });
    }

    return ids;
  }

  /** Turns the request's `createdAt` duration (default "24h") into the pinned
   * lower bound: clamped to the maximum lookback, then floored to the bucket. */
  #computeCreatedAtFilter(createdAt: string | undefined): Date {
    // Clamp to the maximum lookback window, mirroring realtimeClient.
    const floor = new Date(Date.now() - this.options.maximumCreatedAtFilterAgeMs);
    const parsed = safeParseNaturalLanguageDurationAgo(createdAt ?? "24h");
    const resolved = !parsed || parsed < floor ? floor : parsed;
    // Quantize the lower bound to a coarse epoch-aligned bucket and pin THAT in the
    // handle, so same-tag feeds whose windows land in the same bucket resolve to the
    // same filter -> same coalescing cache key -> one shared ClickHouse + Postgres
    // query instead of one per feed. Floored (rounds the bound earlier), so the
    // window only ever widens by < bucket and never drops a run the client should see.
    return new Date(this.#bucketCreatedAtMs(resolved.getTime()));
  }

  /** Floors `ms` to the configured bucket size; a bucket of 0 (or less) leaves it as-is. */
  #bucketCreatedAtMs(ms: number): number {
    const bucket = this.options.runSetCreatedAtBucketMs ?? DEFAULT_RUNSET_CREATED_AT_BUCKET_MS;
    return bucket > 0 ? Math.floor(ms / bucket) * bucket : ms;
  }

  #mintListHandle(createdAtFilterMs: number): string {
    // Pins the createdAt threshold in the opaque handle so live polls reuse the
    // same lower bound even on a working-set cache miss.
+ return `runs_${Math.trunc(createdAtFilterMs)}_${this.#nextSeq()}`; + } + + #filterMsFromHandle(handle: string): number | undefined { + const parts = handle.split("_"); + if (parts[0] !== "runs") { + return undefined; + } + const ms = Number(parts[1]); + return Number.isFinite(ms) && ms > 0 ? ms : undefined; + } + + #parseStreamRequest( + url: URL | string, + requestOptions?: RealtimeRequestOptions + ): { offset: string; handle: string | null; isLive: boolean; skipColumns: string[] } { + const $url = new URL(url.toString()); + return { + offset: $url.searchParams.get("offset") ?? INITIAL_OFFSET, + handle: $url.searchParams.get("handle") ?? $url.searchParams.get("shape_id"), + isLive: $url.searchParams.get("live") === "true", + skipColumns: this.#resolveSkipColumns($url, requestOptions), + }; + } + + /** + * Runs `work` inside a per-env concurrency slot: acquires a slot (429 if over the + * org limit, 500 if the limit can't be read) and always releases it afterward. + */ + async #withConcurrencySlot( + environment: RealtimeEnvironment, + work: () => Promise + ): Promise { + const requestId = randomUUID(); + const concurrencyLimit = await this.options.cachedLimitProvider.getCachedLimit( + environment.organizationId, + DEFAULT_CONCURRENCY_LIMIT + ); + + if (!concurrencyLimit) { + logger.error("[notifierRealtimeClient] Failed to get concurrency limit", { + organizationId: environment.organizationId, + }); + return json({ error: "Failed to get concurrency limit" }, { status: 500 }); + } + + const canProceed = await this.options.limiter.incrementAndCheck( + environment.id, + requestId, + concurrencyLimit + ); + + if (!canProceed) { + return json({ error: "Too many concurrent requests" }, { status: 429 }); + } + + try { + return await work(); + } finally { + await this.options.limiter.decrement(environment.id, requestId); + } + } + + #waitForChange(runId: string, signal?: AbortSignal): Promise { + return 
this.#waitForSubscription(this.options.notifier.subscribeToRunChanges(runId), signal); + } + + #waitForEnvChange(environmentId: string, signal?: AbortSignal): Promise { + return this.#waitForSubscription( + this.options.notifier.subscribeToEnvChanges(environmentId), + signal + ); + } + + /** Race a notifier subscription against the backstop timeout and the abort signal. */ + async #waitForSubscription( + subscription: RunChangeSubscription, + signal?: AbortSignal + ): Promise { + if (signal?.aborted) { + subscription.unsubscribe(); + return "abort"; + } + + let timer: ReturnType | undefined; + let onAbort: (() => void) | undefined; + + try { + return await new Promise((resolve) => { + subscription.changed.then(() => resolve("notify")).catch(() => resolve("timeout")); + + timer = setTimeout(() => resolve("timeout"), this.#jitteredTimeout()); + + if (signal) { + onAbort = () => resolve("abort"); + signal.addEventListener("abort", onAbort, { once: true }); + } + }); + } finally { + if (timer) { + clearTimeout(timer); + } + if (signal && onAbort) { + signal.removeEventListener("abort", onAbort); + } + subscription.unsubscribe(); + } + } + + #jitteredTimeout(): number { + const base = this.options.livePollTimeoutMs ?? DEFAULT_LIVE_POLL_TIMEOUT_MS; + // +/-15% jitter to avoid synchronized refetch herds. + return Math.round(base * (0.85 + Math.random() * 0.3)); + } + + #buildResponse( + body: string, + apiVersion: API_VERSIONS, + clientVersion: string | undefined, + headers: ResponseHeaderInput + ): Response { + const finalBody = + apiVersion === CURRENT_API_VERSION ? body : rewriteBodyForLegacyApiVersion(body); + + const responseHeaders = new Headers(); + responseHeaders.set("content-type", "application/json"); + responseHeaders.set("cache-control", "no-store"); + + // Carry CORS on the response itself, mirroring how the Electric upstream does + // (apiCors passes a response through untouched once it has allow-origin). 
Browsers + // can only read the electric-* headers cross-origin if they're explicitly exposed; + // without this the deployed react-hooks fail with MissingHeadersError. Bearer-token + // requests are non-credentialed, so a wildcard is safe. + responseHeaders.set("access-control-allow-origin", "*"); + responseHeaders.set("access-control-expose-headers", "*"); + + // Modern clients (1.0.14) send `x-trigger-electric-version` and read the + // lowercase `electric-*` headers. Legacy clients (0.4.0) omit the version and + // read `electric-shape-id`/`electric-chunk-last-offset` (case-insensitive), + // matching realtimeClient's rewriteResponseHeaders behavior exactly. + if (clientVersion) { + responseHeaders.set("electric-offset", headers.offset); + responseHeaders.set("electric-handle", headers.handle); + } else { + responseHeaders.set("electric-chunk-last-offset", headers.offset); + responseHeaders.set("electric-shape-id", headers.handle); + } + + if (headers.cursor !== undefined) { + responseHeaders.set("electric-cursor", headers.cursor); + } + if (headers.schema !== undefined) { + responseHeaders.set("electric-schema", headers.schema); + } + + return new Response(finalBody, { status: 200, headers: responseHeaders }); + } + + #mintHandle(runId: string): string { + // Stable per-run handle: the single-run shape never changes columns, so the + // client never needs a must-refetch from a handle change. + return `run-${runId}`; + } + + #nextSeq(): number { + this.#seq = (this.#seq + 1) % Number.MAX_SAFE_INTEGER; + return this.#seq; + } + + #resolveSkipColumns(url: URL, requestOptions?: RealtimeRequestOptions): string[] { + const raw = requestOptions?.skipColumns ?? url.searchParams.get("skipColumns")?.split(",") ?? 
[]; + return raw.map((c) => c.trim()).filter((c) => c !== "" && !RESERVED_COLUMNS.includes(c)); + } +} diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts new file mode 100644 index 00000000000..2888deec863 --- /dev/null +++ b/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts @@ -0,0 +1,99 @@ +import { Counter, Gauge, Histogram } from "prom-client"; +import { $replica } from "~/db.server"; +import { env } from "~/env.server"; +import { metricsRegister } from "~/metrics.server"; +import { singleton } from "~/utils/singleton"; +import { getCachedLimit } from "../platform.v3.server"; +import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; +import { ClickHouseRunListResolver } from "./clickHouseRunListResolver.server"; +import { NotifierRealtimeClient } from "./notifierRealtimeClient.server"; +import { RealtimeConcurrencyLimiter } from "./realtimeConcurrencyLimiter.server"; +import { getRunChangeNotifier } from "./runChangeNotifierInstance.server"; +import { RunHydrator } from "./runReader.server"; + +/** + * Process-singleton wiring for the notifier-backed realtime client. Only + * constructed when a request actually routes to the + * notifier backend, so a disabled webapp never instantiates it. + */ +function initializeNotifierRealtimeClient(): NotifierRealtimeClient { + const wakeups = new Counter({ + name: "realtime_notifier_wakeups_total", + help: "Live realtime notifier wakeups by reason. A rising 'timeout' share suggests a write site is missing its publishRunChanged delegate.", + labelNames: ["reason"] as const, + registers: [metricsRegister], + }); + + const runSetResolves = new Counter({ + name: "realtime_notifier_runset_resolve_total", + help: "Multi-run (tag-list/batch) resolve+hydrate outcomes. 
'hit'/'coalesced' vs 'miss' shows how effectively concurrent same-filter feeds share a single ClickHouse + Postgres query under an env-wide wake.", + labelNames: ["result"] as const, + registers: [metricsRegister], + }); + + const runSetQueryMs = new Histogram({ + name: "realtime_notifier_runset_query_ms", + help: "Latency of the multi-run resolve (ClickHouse) and hydrate (Postgres) stages.", + labelNames: ["stage"] as const, + buckets: [1, 5, 10, 25, 50, 100, 250, 500, 1_000, 2_500, 5_000], + registers: [metricsRegister], + }); + + const limiter = new RealtimeConcurrencyLimiter({ + keyPrefix: "tr:realtime:notifier:concurrency", + redis: { + port: env.RATE_LIMIT_REDIS_PORT, + host: env.RATE_LIMIT_REDIS_HOST, + username: env.RATE_LIMIT_REDIS_USERNAME, + password: env.RATE_LIMIT_REDIS_PASSWORD, + tlsDisabled: env.RATE_LIMIT_REDIS_TLS_DISABLED === "true", + clusterMode: env.RATE_LIMIT_REDIS_CLUSTER_MODE_ENABLED === "1", + }, + }); + + const client = new NotifierRealtimeClient({ + runReader: new RunHydrator({ replica: $replica }), + runListResolver: new ClickHouseRunListResolver({ + getClickhouse: (organizationId) => + clickhouseFactory.getClickhouseForOrganization(organizationId, "standard"), + prisma: $replica, + }), + notifier: getRunChangeNotifier(), + limiter, + cachedLimitProvider: { + async getCachedLimit(organizationId, defaultValue) { + const result = await getCachedLimit( + organizationId, + "realtimeConcurrentConnections", + defaultValue + ); + return result.val; + }, + }, + livePollTimeoutMs: env.REALTIME_NOTIFIER_LIVE_POLL_TIMEOUT_MS, + maximumCreatedAtFilterAgeMs: env.REALTIME_MAXIMUM_CREATED_AT_FILTER_AGE_IN_MS, + maxListResults: env.REALTIME_NOTIFIER_MAX_LIST_RESULTS, + runSetResolveCacheTtlMs: env.REALTIME_NOTIFIER_RUNSET_CACHE_TTL_MS, + runSetResolveCacheMaxEntries: env.REALTIME_NOTIFIER_RUNSET_CACHE_MAX_ENTRIES, + listCacheMaxEntries: env.REALTIME_NOTIFIER_WORKING_SET_MAX_ENTRIES, + runSetCreatedAtBucketMs: 
env.REALTIME_NOTIFIER_RUNSET_CREATED_AT_BUCKET_MS, + onWakeup: (reason) => wakeups.inc({ reason }), + onRunSetResolve: (result) => runSetResolves.inc({ result }), + onRunSetQuery: (stage, ms) => runSetQueryMs.observe({ stage }, ms), + }); + + new Gauge({ + name: "realtime_notifier_working_set_size", + help: "Entries in the per-handle working-set cache (one per active multi-run feed session).", + registers: [metricsRegister], + collect() { + this.set(client.workingSetCacheSize); + }, + }); + + return client; +} + +export function getNotifierRealtimeClient(): NotifierRealtimeClient { + return singleton("notifierRealtimeClient", initializeNotifierRealtimeClient); +} diff --git a/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts b/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts new file mode 100644 index 00000000000..a935858fef0 --- /dev/null +++ b/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts @@ -0,0 +1,111 @@ +import { Callback, Result } from "ioredis"; +import { createRedisClient, RedisClient, RedisWithClusterOptions } from "~/redis.server"; +import { logger } from "../logger.server"; + +export type RealtimeConcurrencyLimiterOptions = { + redis: RedisWithClusterOptions; + keyPrefix: string; + /** How long a tracked request lives before it's swept as stale (seconds). */ + expiryTimeInSeconds?: number; + connectionName?: string; +}; + +/** + * Per-environment concurrent-connection limiter for realtime long-polls. + * + * This is a standalone copy of the limiter embedded in `realtimeClient.server.ts` + * (Electric path), so the notifier-backed client can enforce the same per-env cap + * WITHOUT modifying the existing Electric client. The Lua + key shape are + * identical; only the key prefix differs, so the two paths track independently. 
+ */ +export class RealtimeConcurrencyLimiter { + private redis: RedisClient; + private expiryTimeInSeconds: number; + + constructor(private options: RealtimeConcurrencyLimiterOptions) { + this.redis = createRedisClient( + options.connectionName ?? "trigger:realtime:notifier:concurrency", + options.redis + ); + this.expiryTimeInSeconds = options.expiryTimeInSeconds ?? 60 * 5; + this.#registerCommands(); + } + + async incrementAndCheck(environmentId: string, requestId: string, limit: number): Promise { + const key = this.#getKey(environmentId); + const now = Date.now(); + + const result = await this.redis.incrementAndCheckRealtimeNotifierConcurrency( + key, + now.toString(), + requestId, + this.expiryTimeInSeconds.toString(), + (now - this.expiryTimeInSeconds * 1000).toString(), + limit.toString() + ); + + return result === 1; + } + + async decrement(environmentId: string, requestId: string): Promise { + const key = this.#getKey(environmentId); + await this.redis.zrem(key, requestId); + } + + #getKey(environmentId: string): string { + return `${this.options.keyPrefix}:${environmentId}`; + } + + #registerCommands() { + this.redis.defineCommand("incrementAndCheckRealtimeNotifierConcurrency", { + numberOfKeys: 1, + lua: /* lua */ ` + local concurrencyKey = KEYS[1] + + local timestamp = tonumber(ARGV[1]) + local requestId = ARGV[2] + local expiryTime = tonumber(ARGV[3]) + local cutoffTime = tonumber(ARGV[4]) + local limit = tonumber(ARGV[5]) + + -- Remove expired entries + redis.call('ZREMRANGEBYSCORE', concurrencyKey, '-inf', cutoffTime) + + -- Add the new request to the sorted set + redis.call('ZADD', concurrencyKey, timestamp, requestId) + + -- Set the expiry time on the key + redis.call('EXPIRE', concurrencyKey, expiryTime) + + -- Get the total number of concurrent requests + local totalRequests = redis.call('ZCARD', concurrencyKey) + + -- Check if the limit has been exceeded + if totalRequests > limit then + redis.call('ZREM', concurrencyKey, requestId) + return 0 
+ end + + return 1 + `, + }); + + this.redis.on("error", (error) => { + logger.error("[realtimeConcurrencyLimiter] redis error", { error }); + }); + } +} + +declare module "ioredis" { + interface RedisCommander { + incrementAndCheckRealtimeNotifierConcurrency( + key: string, + timestamp: string, + requestId: string, + expiryTime: string, + cutoffTime: string, + limit: string, + callback?: Callback + ): Result; + } +} diff --git a/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts b/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts new file mode 100644 index 00000000000..220f79f9308 --- /dev/null +++ b/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts @@ -0,0 +1,86 @@ +import { $replica } from "~/db.server"; +import { env } from "~/env.server"; +import { FEATURE_FLAG } from "~/v3/featureFlags"; +import { makeFlag } from "~/v3/featureFlags.server"; +import { logger } from "../logger.server"; +import { type RealtimeEnvironment } from "../realtimeClient.server"; +import { realtimeClient } from "../realtimeClientGlobal.server"; +import { BoundedTtlCache } from "./boundedTtlCache"; +import { type RealtimeStreamClient } from "./notifierRealtimeClient.server"; +import { getNotifierRealtimeClient } from "./notifierRealtimeClientInstance.server"; +import { getShadowRealtimeClient } from "./shadowRealtimeClientInstance.server"; + +type RealtimeBackend = "electric" | "notifier" | "shadow"; + +/** + * Chooses which backend serves a realtime run request. + * + * Two gates, both defaulting to the Electric path: + * 1. `REALTIME_NOTIFIER_ENABLED` (env master switch). When off, this returns the + * Electric client immediately — no flag read, no notifier client construction, + * byte-identical to pre-Electric-Sunset behavior. + * 2. the `realtimeBackend` feature flag (global + per-org, org wins), resolved per + * org and cached in-process for 30s so the long-poll feed doesn't hit the DB + * on every request. 
/**
 * Chooses the client that serves a realtime run request for `environment`.
 *
 * When the process-level switch (`notifierEnabled`) is off, the Electric client is
 * returned immediately without reading any flag. Otherwise the organization's
 * `realtimeBackend` flag decides: `"notifier"` and `"shadow"` map to their own
 * clients, and every other value falls back to Electric.
 *
 * @param environment The environment the request is scoped to; only its
 *   `organizationId` is read here.
 * @returns The client that should serve the request.
 */
export async function resolveRealtimeStreamClient(
  environment: RealtimeEnvironment
): Promise<RealtimeStreamClient> {
  if (!notifierEnabled) {
    return realtimeClient;
  }

  switch (await getRealtimeBackend(environment.organizationId)) {
    case "notifier":
      return getNotifierRealtimeClient();
    case "shadow":
      // Client is still served Electric; the notifier path is diffed in the background.
      return getShadowRealtimeClient();
    case "electric":
    default:
      return realtimeClient;
  }
}

/**
 * Resolves the backend for an organization, consulting the in-process cache first.
 *
 * On a cache miss the organization's `featureFlags` are read from the replica and
 * passed as overrides to the `realtimeBackend` flag. Any failure is logged and
 * resolves to `"electric"`. The result, including that fallback, is stored in
 * `backendCache` before returning.
 */
async function getRealtimeBackend(organizationId: string): Promise<RealtimeBackend> {
  const cached = backendCache.get(organizationId);
  if (cached !== undefined) {
    return cached;
  }

  // Stays "electric" unless the flag lookup below completes successfully.
  let backend: RealtimeBackend = "electric";

  try {
    const org = await $replica.organization.findFirst({
      where: { id: organizationId },
      select: { featureFlags: true },
    });

    backend = await flag({
      key: FEATURE_FLAG.realtimeBackend,
      defaultValue: "electric",
      overrides: (org?.featureFlags as Record<string, unknown>) ?? {},
    });
  } catch (error) {
    // Never let a flag lookup failure break the realtime feed — fall back to Electric.
    logger.error("[resolveRealtimeStreamClient] failed to resolve realtimeBackend flag", {
      organizationId,
      error,
    });
  }

  backendCache.set(organizationId, backend);
  return backend;
}
/**
 * Carries "run X changed" signals from write sites to the realtime feed over
 * Redis pub/sub.
 *
 * - Only ids travel on the wire (the payload is the run id), never row data.
 * - One publisher and one subscriber connection are created lazily, on first use.
 * - Subscriptions are refcounted per channel through `#listeners`: the first
 *   listener for a channel issues SUBSCRIBE, the last one leaving issues
 *   UNSUBSCRIBE.
 * - `publish` is fire-and-forget and never throws; failures are logged at debug.
 *
 * Channel names embed the id inside `{...}` (a Redis hash-tag).
 */
export class RunChangeNotifier {
  #publisher: RedisClient | undefined;
  #subscriber: RedisClient | undefined;
  /** Channel name -> resolvers of the waiters currently parked on that channel. */
  readonly #listeners = new Map<string, Set<() => void>>();
  readonly #channelPrefix: string;
  readonly #connectionName: string;

  constructor(private readonly options: RunChangeNotifierOptions) {
    this.#channelPrefix = options.channelPrefix ?? DEFAULT_CHANNEL_PREFIX;
    this.#connectionName = options.connectionName ?? "trigger:realtime:run-change-notifier";
  }

  /**
   * Fire-and-forget publish of a run-changed signal. Never throws. Publishes to
   * the per-run channel and, when `environmentId` is set, also to the per-env
   * channel. The payload is the run id in both cases.
   */
  publish(input: RunChangeInput): void {
    this.#publishToChannel(this.#channelForRun(input.runId), input.runId);
    if (input.environmentId) {
      this.#publishToChannel(this.#channelForEnv(input.environmentId), input.runId);
    }
  }

  /** Publishes `payload` on `channel`, logging (not throwing) on any failure. */
  #publishToChannel(channel: string, payload: string): void {
    try {
      const publisher = this.#ensurePublisher();
      const result = publisher.publish(channel, payload);
      // Only attach a rejection handler when the client actually returned a thenable.
      if (typeof (result as Promise<unknown>)?.catch === "function") {
        (result as Promise<unknown>).catch((error) => {
          logger.debug("[runChangeNotifier] publish failed", { error, channel });
        });
      }
    } catch (error) {
      logger.debug("[runChangeNotifier] publish threw", { error, channel });
    }
  }

  /** Fire-and-forget publish of many run-changed signals. Never throws. */
  publishMany(inputs: RunChangeInput[]): void {
    for (const input of inputs) {
      this.publish(input);
    }
  }

  /** Subscribe to the next change for a single run (per-run channel). */
  subscribeToRunChanges(runId: string): RunChangeSubscription {
    return this.#subscribe(this.#channelForRun(runId));
  }

  /** Subscribe to the next change of ANY run in an environment (per-env channel). */
  subscribeToEnvChanges(environmentId: string): RunChangeSubscription {
    return this.#subscribe(this.#channelForEnv(environmentId));
  }

  /**
   * Refcounted subscribe over the shared subscriber, keyed by the full channel
   * name. Returns a promise that resolves on the next message for the channel and
   * an idempotent `unsubscribe` that removes this waiter.
   */
  #subscribe(channel: string): RunChangeSubscription {
    const subscriber = this.#ensureSubscriber();

    let resolveChanged: () => void = () => {};
    const changed = new Promise<void>((resolve) => {
      resolveChanged = resolve;
    });

    let listeners = this.#listeners.get(channel);
    if (!listeners) {
      // First listener for this channel: create the entry and SUBSCRIBE in Redis.
      listeners = new Set();
      this.#listeners.set(channel, listeners);
      subscriber.subscribe(channel).catch((error) => {
        logger.debug("[runChangeNotifier] subscribe failed", { error, channel });
      });
    }
    listeners.add(resolveChanged);

    let unsubscribed = false;
    const unsubscribe = () => {
      if (unsubscribed) {
        return;
      }
      unsubscribed = true;

      const current = this.#listeners.get(channel);
      if (!current) {
        return;
      }
      current.delete(resolveChanged);
      if (current.size === 0) {
        // Drop the channel from the map only AFTER Redis confirms UNSUBSCRIBE, and
        // only if no new listener re-subscribed while it was in flight. The map
        // entry's existence mirrors "subscribed (or subscribe in flight) in Redis",
        // so the subscribe path safely reuses it without a duplicate SUBSCRIBE.
        subscriber
          .unsubscribe(channel)
          .then(() => {
            const latest = this.#listeners.get(channel);
            if (!latest) {
              return;
            }
            if (latest.size === 0) {
              this.#listeners.delete(channel);
            } else {
              // A listener arrived during the in-flight UNSUBSCRIBE; the channel is
              // now unsubscribed in Redis but has live waiters. Re-subscribe so they
              // still receive messages.
              subscriber.subscribe(channel).catch((error) => {
                logger.debug("[runChangeNotifier] resubscribe failed", { error, channel });
              });
            }
          })
          .catch((error) => {
            // UNSUBSCRIBE failed: the channel is likely still subscribed in Redis.
            // Keep the (empty) map entry so a future subscriber reuses it without a
            // duplicate SUBSCRIBE and #onMessage stays consistent with Redis state.
            logger.debug("[runChangeNotifier] unsubscribe failed", { error, channel });
          });
      }
    };

    return { changed, unsubscribe };
  }

  /** Number of distinct channels currently tracked in `#listeners` (for metrics). */
  get activeSubscriptionCount(): number {
    return this.#listeners.size;
  }

  /** Quits both connections (ignoring individual failures) and clears all state. */
  async quit(): Promise<void> {
    await Promise.allSettled([this.#subscriber?.quit(), this.#publisher?.quit()]);
    this.#subscriber = undefined;
    this.#publisher = undefined;
    this.#listeners.clear();
  }

  /** Lazily creates the publisher connection on first publish. */
  #ensurePublisher(): RedisClient {
    if (!this.#publisher) {
      this.#publisher = createRedisClient(`${this.#connectionName}:pub`, this.options.redis);
    }
    return this.#publisher;
  }

  /** Lazily creates the subscriber connection and wires its message handler. */
  #ensureSubscriber(): RedisClient {
    if (!this.#subscriber) {
      const subscriber = createRedisClient(`${this.#connectionName}:sub`, this.options.redis);
      subscriber.on("message", (channel: string) => this.#onMessage(channel));
      this.#subscriber = subscriber;
    }
    return this.#subscriber;
  }

  /** Wakes every waiter currently registered for `channel`. */
  #onMessage(channel: string) {
    const listeners = this.#listeners.get(channel);
    if (!listeners) {
      return;
    }
    // One-shot: each waiter resolves its race and removes itself via unsubscribe().
    // Iterate over a copy so removals during the loop are safe.
    for (const resolve of [...listeners]) {
      resolve();
    }
  }

  // Channels embed the id in a `{...}` hash-tag.
  #channelForRun(runId: string): string {
    return `${this.#channelPrefix}run:{${runId}}`;
  }

  #channelForEnv(environmentId: string): string {
    return `${this.#channelPrefix}env:{${environmentId}}`;
  }
}
/**
 * Attaches run-changed delegates to the Run Engine event bus. Does nothing unless
 * `REALTIME_NOTIFIER_ENABLED` is "1". Every listener is a fire-and-forget call to
 * `publishRunChanged`; no other logic lives here.
 */
export function registerRunChangeNotifierHandlers() {
  if (env.REALTIME_NOTIFIER_ENABLED !== "1") {
    return;
  }

  const bus = engine.eventBus;

  // Delegate for events whose payload carries both the run and its environment.
  const notifyWithEnvironment = ({
    run,
    environment,
  }: {
    run: { id: string };
    environment: { id: string };
  }) => {
    publishRunChanged({ runId: run.id, environmentId: environment.id });
  };

  // Delegate for events handled without an environment id: only the per-run
  // channel is notified.
  const notifyRunOnly = ({ run }: { run: { id: string } }) => {
    publishRunChanged({ runId: run.id });
  };

  // Status transitions (checkpoint suspend/resume, pending version, dequeue).
  bus.on("runStatusChanged", notifyWithEnvironment);

  // Dequeue/lock (sets startedAt) and attempt start (DEQUEUED -> EXECUTING).
  bus.on("runLocked", notifyWithEnvironment);
  bus.on("runAttemptStarted", notifyWithEnvironment);

  // Terminal + failure transitions.
  bus.on("runSucceeded", notifyWithEnvironment);
  bus.on("runFailed", notifyWithEnvironment);
  bus.on("runExpired", notifyWithEnvironment);
  bus.on("runCancelled", notifyWithEnvironment);
  bus.on("runRetryScheduled", notifyWithEnvironment);

  // Delay lifecycle (delayUntil / queued-after-delay changes).
  bus.on("runDelayRescheduled", notifyWithEnvironment);
  bus.on("runEnqueuedAfterDelay", notifyWithEnvironment);

  // Attempt failures and metadata updates: the single-run channel is keyed by
  // runId alone, so no environment id is needed.
  bus.on("runAttemptFailed", notifyRunOnly);
  bus.on("runMetadataUpdated", notifyRunOnly);

  logger.info("[runChangeNotifier] realtime run-change notifier handlers registered");
}
/** Process-level switch, read once at module load. */
const notifierEnabled = env.REALTIME_NOTIFIER_ENABLED === "1";

/**
 * Builds the notifier from the `PUBSUB_REDIS_*` settings and registers a gauge
 * that reports its `activeSubscriptionCount` at collection time.
 */
function initializeRunChangeNotifier(): RunChangeNotifier {
  const notifier = new RunChangeNotifier({
    redis: {
      host: env.PUBSUB_REDIS_HOST,
      port: env.PUBSUB_REDIS_PORT,
      username: env.PUBSUB_REDIS_USERNAME,
      password: env.PUBSUB_REDIS_PASSWORD,
      tlsDisabled: env.PUBSUB_REDIS_TLS_DISABLED === "true",
      clusterMode: env.PUBSUB_REDIS_CLUSTER_MODE_ENABLED === "1",
    },
  });

  new Gauge({
    name: "realtime_run_change_notifier_active_subscriptions",
    // The value counts channels, which include per-env channels as well as
    // per-run ones, so the help text must not say "runs".
    help: "Distinct channels (per-run and per-env) currently subscribed for realtime change notifications",
    collect() {
      this.set(notifier.activeSubscriptionCount);
    },
    registers: [metricsRegister],
  });

  return notifier;
}

/** Lazily construct (and memoize via `singleton`) the notifier. */
export function getRunChangeNotifier(): RunChangeNotifier {
  return singleton("runChangeNotifier", initializeRunChangeNotifier);
}

/** Whether the notifier subsystem is enabled for this process. */
export function isRunChangeNotifierEnabled(): boolean {
  return notifierEnabled;
}

/** Fire-and-forget run-changed notify. No-op (and no notifier construction) when disabled. */
export function publishRunChanged(input: RunChangeInput): void {
  if (!notifierEnabled) {
    return;
  }
  getRunChangeNotifier().publish(input);
}

/** Batch variant of `publishRunChanged`; also a no-op when disabled. */
export function publishManyRunChanged(inputs: RunChangeInput[]): void {
  if (!notifierEnabled) {
    return;
  }
  getRunChangeNotifier().publishMany(inputs);
}

/**
 * Subscribe to the next change for a run via the shared subscriber.
 * NOTE(review): unlike the publish helpers this is not gated on `notifierEnabled`,
 * so calling it constructs the notifier — presumably callers only reach it when
 * the notifier backend is selected; confirm.
 */
export function subscribeToRunChanges(runId: string): RunChangeSubscription {
  return getRunChangeNotifier().subscribeToRunChanges(runId);
}
/**
 * Projects `RUN_HYDRATOR_SELECT` down to the columns the client did not skip.
 * Columns in `ALWAYS_HYDRATED_COLUMNS` are kept even when listed in `skipColumns`.
 *
 * @param skipColumns Column names the client asked to omit. Names that are not
 *   keys of `RUN_HYDRATOR_SELECT` have no effect.
 * @returns `RUN_HYDRATOR_SELECT` itself (same object) when nothing is skipped,
 *   otherwise a new select object containing only the retained columns.
 */
export function buildHydratorSelect(skipColumns: string[] = []): Prisma.TaskRunSelect {
  if (skipColumns.length === 0) {
    return RUN_HYDRATOR_SELECT;
  }

  const skipped = new Set(skipColumns);
  const select: Record<string, boolean> = {};
  for (const column of Object.keys(RUN_HYDRATOR_SELECT)) {
    if (ALWAYS_HYDRATED_COLUMNS.has(column) || !skipped.has(column)) {
      select[column] = true;
    }
  }
  return select as Prisma.TaskRunSelect;
}
/**
 * Hydrates runs by id from the injected replica client, projected to the realtime
 * columns. Concurrent `getRunById` calls for the same (environment, run) share one
 * in-flight query, and results are kept in a short TTL cache when
 * `cacheTtlMs > 0`.
 */
export class RunHydrator {
  /** Key `${environmentId}:${runId}` -> the query currently in flight for it. */
  readonly #inflight = new Map<string, Promise<RealtimeRunRow | null>>();
  // NOTE(review): the type-argument list of BoundedTtlCache is reconstructed (the
  // class is declared in another file); confirm whether it takes only the value
  // type or a key type as well.
  readonly #cache: BoundedTtlCache<RealtimeRunRow | null>;
  readonly #cacheTtlMs: number;

  constructor(private readonly options: RunHydratorOptions) {
    this.#cacheTtlMs = options.cacheTtlMs ?? DEFAULT_CACHE_TTL_MS;
    this.#cache = new BoundedTtlCache<RealtimeRunRow | null>(
      this.#cacheTtlMs,
      options.maxCacheEntries ?? DEFAULT_MAX_CACHE_ENTRIES
    );
  }

  /**
   * Returns the run row, or `null` when no run with that id exists in the
   * environment. Order of lookup: TTL cache, then any in-flight query for the
   * same key, then a fresh replica query.
   */
  async getRunById(environmentId: string, runId: string): Promise<RealtimeRunRow | null> {
    const key = `${environmentId}:${runId}`;

    if (this.#cacheTtlMs > 0) {
      // A cached null is a valid "run not found" hit; only undefined is a miss.
      const cached = this.#cache.get(key);
      if (cached !== undefined) {
        return cached;
      }
    }

    const existing = this.#inflight.get(key);
    if (existing) {
      return existing;
    }

    // Register the query before awaiting so concurrent callers join it; the entry
    // is removed whether the query resolves or rejects.
    const promise = this.#fetch(environmentId, runId).finally(() => this.#inflight.delete(key));
    this.#inflight.set(key, promise);

    const row = await promise;

    if (this.#cacheTtlMs > 0) {
      this.#cache.set(key, row);
    }

    return row;
  }

  /**
   * Hydrate many runs by id in one query. Order is not guaranteed.
   * `skipColumns` projects the SELECT (see `buildHydratorSelect`) so skipped
   * columns are not fetched. This path bypasses the cache and single-flight map.
   */
  async hydrateByIds(
    environmentId: string,
    ids: string[],
    skipColumns: string[] = []
  ): Promise<RealtimeRunRow[]> {
    if (ids.length === 0) {
      return [];
    }
    const rows = await this.options.replica.taskRun.findMany({
      where: {
        runtimeEnvironmentId: environmentId,
        id: { in: ids },
      },
      select: buildHydratorSelect(skipColumns),
    });
    return rows as unknown as RealtimeRunRow[];
  }

  /** Single-run replica query with the full `RUN_HYDRATOR_SELECT` projection. */
  async #fetch(environmentId: string, runId: string): Promise<RealtimeRunRow | null> {
    const run = await this.options.replica.taskRun.findFirst({
      where: {
        id: runId,
        runtimeEnvironmentId: environmentId,
      },
      select: RUN_HYDRATOR_SELECT,
    });

    return (run ?? null) as RealtimeRunRow | null;
  }
}
+ * + * The client is always served the Electric response; in the background this + * re-derives what the notifier path WOULD emit and diffs the two, so we can prove + * parity on real production traffic before any cutover. + * + * Two kinds of divergence are checked: + * - serialization: for each run Electric emitted, re-hydrate it and serialize via + * the notifier serializer, then compare SEMANTICALLY (decode both sides per + * column type) so equivalent-but-differently-encoded wire values (timestamp + * format, bool t/true, number formatting) are not false positives. The compare + * is gated on same-version (matching updatedAt) so a row that changed between + * Electric's emit and our refetch is recorded as "skew", not a divergence. + * - membership (tag/batch initial snapshot only): the set of run ids Electric + * emitted vs the set the notifier resolver returns. This is where the known + * tag OR-vs-AND difference shows up. + * + * Pure except for the injected RunHydrator/RunListResolver, so it's unit-testable. + */ + +export type ShadowFeed = "run" | "runs" | "batch"; + +type WireValue = Record; + +type ShapeMessage = { + key?: string; + value?: WireValue; + headers: { operation?: string; control?: string }; +}; + +const COLUMN_BY_NAME = new Map(RUN_ELECTRIC_COLUMNS.map((column) => [column.name, column])); + +export type ColumnDiff = { + runId: string; + column: string; + electric: string | null; + notifier: string | null; +}; + +export type ShadowCompareOutcome = { + feed: ShadowFeed; + /** Runs whose every emitted column matched (same-version). */ + serializationMatched: number; + /** Runs with at least one semantic column divergence (same-version). */ + serializationDiverged: number; + /** Runs that changed between Electric's emit and our refetch (not a divergence). */ + serializationSkew: number; + /** Per-column divergences (capped) for logging. */ + diffs: ColumnDiff[]; + /** Set membership (tag/batch initial snapshot only). undefined when not checked. 
/**
 * Diffs a served Electric response against what the notifier path would emit.
 * See `compare` for the two checks performed.
 */
export class RealtimeShadowComparator {
  constructor(
    private readonly options: { runReader: RunHydrator; runListResolver: RunListResolver }
  ) {}

  /**
   * Compares one Electric response body with the notifier path.
   *
   * - Serialization: every non-delete change message with a `value.id` is
   *   re-hydrated via the run reader and serialized with `serializeRunRow`. Rows
   *   that are missing or whose `updatedAt` differs count as skew; otherwise each
   *   column Electric emitted (and that is known in `COLUMN_BY_NAME`) is compared
   *   with `valuesEqual`.
   * - Membership: only when `isInitialSnapshot` and `membershipFilter` are both
   *   set, the ids Electric emitted are compared with the resolver's id-set.
   *
   * @returns Counters, up to `MAX_DIFFS` column diffs, and the membership result
   *   when it was checked.
   */
  async compare(input: ShadowCompareInput): Promise<ShadowCompareOutcome> {
    const messages = parseBody(input.electricBody);
    const changes = messages.filter(
      (m): m is ShapeMessage & { value: WireValue } =>
        typeof m.headers?.operation === "string" && !!m.value && m.headers.operation !== "delete"
    );

    const outcome: ShadowCompareOutcome = {
      feed: input.feed,
      serializationMatched: 0,
      serializationDiverged: 0,
      serializationSkew: 0,
      diffs: [],
    };

    for (const message of changes) {
      const runId = message.value.id ?? undefined;
      if (!runId) {
        continue;
      }

      const row = await this.options.runReader.getRunById(input.environment.id, runId);
      if (!row) {
        // Run no longer readable (deleted / replica miss). Not a serialization divergence.
        outcome.serializationSkew++;
        continue;
      }

      const notifierValue = serializeRunRow(row, input.skipColumns);

      // Only compare rows at the same version; otherwise the row advanced between
      // Electric's emit and our refetch (timing skew, not a divergence).
      if (!sameInstant(message.value.updatedAt, notifierValue.updatedAt)) {
        outcome.serializationSkew++;
        continue;
      }

      let rowDiverged = false;
      for (const [column, electricRaw] of Object.entries(message.value)) {
        const meta = COLUMN_BY_NAME.get(column);
        if (!meta) {
          // Column unknown to the notifier schema: nothing to compare against.
          continue;
        }
        const notifierRaw = notifierValue[column] ?? null;
        if (!valuesEqual(electricRaw, notifierRaw, meta.type, meta.dims, column)) {
          rowDiverged = true;
          if (outcome.diffs.length < MAX_DIFFS) {
            outcome.diffs.push({ runId, column, electric: electricRaw, notifier: notifierRaw });
          }
        }
      }

      if (rowDiverged) {
        outcome.serializationDiverged++;
      } else {
        outcome.serializationMatched++;
      }
    }

    if (input.isInitialSnapshot && input.membershipFilter) {
      const electricIds = new Set(
        changes.map((m) => m.value.id).filter((id): id is string => typeof id === "string")
      );
      const notifierIds = new Set(
        await this.options.runListResolver.resolveMatchingRunIds(input.membershipFilter)
      );

      outcome.missingInNotifier = [...electricIds].filter((id) => !notifierIds.has(id));
      outcome.extraInNotifier = [...notifierIds].filter((id) => !electricIds.has(id));
      outcome.membershipMatch =
        outcome.missingInNotifier.length === 0 && outcome.extraInNotifier.length === 0;
    }

    return outcome;
  }
}

/** Parses a response body as a JSON array of messages; anything else yields `[]`. */
function parseBody(body: string): ShapeMessage[] {
  const text = body.trim();
  if (!text) {
    return [];
  }
  try {
    const parsed = JSON.parse(text);
    return Array.isArray(parsed) ? (parsed as ShapeMessage[]) : [];
  } catch {
    return [];
  }
}

/** Status carries a known legacy rewrite (DEQUEUED -> EXECUTING) applied equally to
 * both paths for non-current API versions; treat them as equivalent. */
function normalizeStatus(value: string): string {
  return value === "DEQUEUED" ? "EXECUTING" : value;
}

/** True when both timestamps denote the same instant, or both are null/undefined. */
function sameInstant(a: string | null | undefined, b: string | null | undefined): boolean {
  if (a == null || b == null) {
    return a == null && b == null;
  }
  // Mirror the SDK's RawShapeDate (`new Date(val + "Z")`).
  return new Date(`${a}Z`).getTime() === new Date(`${b}Z`).getTime();
}

/**
 * Semantic equality of two wire values for one column: both sides are decoded
 * according to the column type before comparing, so equivalent-but-differently-
 * encoded values are not reported as divergences.
 *
 * @param dims When greater than 0 the values are treated as Postgres text-array
 *   literals and compared element-wise.
 * @param column Column name; only `status` gets special handling.
 */
function valuesEqual(
  electricRaw: string | null,
  notifierRaw: string | null,
  type: ElectricColumnType,
  dims: number | undefined,
  column: string
): boolean {
  if (electricRaw == null || notifierRaw == null) {
    return electricRaw == null && notifierRaw == null;
  }

  // Byte-identical wire strings are equal under every decoding. Checking this
  // first avoids false divergences where decoding yields NaN on both sides (a
  // float8 "NaN", or a timestamp `Date` cannot parse), since NaN !== NaN.
  if (electricRaw === notifierRaw) {
    return true;
  }

  if (dims && dims > 0) {
    return arraysEqual(parsePgTextArray(electricRaw), parsePgTextArray(notifierRaw));
  }

  switch (type) {
    case "timestamp":
      return new Date(`${electricRaw}Z`).getTime() === new Date(`${notifierRaw}Z`).getTime();
    case "bool":
      return parseBool(electricRaw) === parseBool(notifierRaw);
    case "int4":
    case "int8":
    case "float8":
      return Number(electricRaw) === Number(notifierRaw);
    case "jsonb":
      return jsonEqual(electricRaw, notifierRaw);
    case "text":
    default:
      if (column === "status") {
        return normalizeStatus(electricRaw) === normalizeStatus(notifierRaw);
      }
      // Identical strings already returned true above.
      return false;
  }
}

/** Accepts both the Postgres (`t`) and JSON (`true`) spellings of true. */
function parseBool(value: string): boolean {
  return value === "t" || value === "true";
}

/** Structural JSON equality; falls back to string equality if either side fails to parse. */
function jsonEqual(a: string, b: string): boolean {
  try {
    return deepEqual(JSON.parse(a), JSON.parse(b));
  } catch {
    return a === b;
  }
}
/**
 * Structural equality for JSON-shaped values (primitives, arrays, plain objects).
 * Object key order is ignored. An array never equals a non-array object, even when
 * the object has the same index-like keys.
 */
function deepEqual(a: unknown, b: unknown): boolean {
  if (a === b) return true;
  if (typeof a !== "object" || typeof b !== "object" || a === null || b === null) {
    return false;
  }

  const aIsArray = Array.isArray(a);
  if (aIsArray !== Array.isArray(b)) {
    return false;
  }
  if (aIsArray) {
    const left = a as unknown[];
    const right = b as unknown[];
    return left.length === right.length && left.every((v, i) => deepEqual(v, right[i]));
  }

  const left = a as Record<string, unknown>;
  const right = b as Record<string, unknown>;
  const leftKeys = Object.keys(left);
  if (leftKeys.length !== Object.keys(right).length) {
    return false;
  }
  // Same key count plus every left key present on the right implies identical key sets.
  return leftKeys.every(
    (k) => Object.prototype.hasOwnProperty.call(right, k) && deepEqual(left[k], right[k])
  );
}

/** Element-wise, order-sensitive equality of two string arrays. */
function arraysEqual(a: string[], b: string[]): boolean {
  return a.length === b.length && a.every((v, i) => v === b[i]);
}

/**
 * Parse a Postgres text-array literal (`{"a","b"}` / `{}`) into its elements.
 * Quoted elements honor backslash escapes; unquoted elements run to the next
 * comma. A dangling backslash at the very end of a quoted element is dropped.
 */
function parsePgTextArray(literal: string): string[] {
  if (literal === "{}" || literal === "") {
    return [];
  }
  const inner = literal.startsWith("{") && literal.endsWith("}") ? literal.slice(1, -1) : literal;
  const result: string[] = [];
  let i = 0;
  while (i < inner.length) {
    let element = "";
    if (inner[i] === '"') {
      i++; // skip the opening quote
      while (i < inner.length && inner[i] !== '"') {
        if (inner[i] === "\\") {
          i++; // the escaped character follows the backslash
        }
        // Guard against a trailing backslash with nothing after it.
        if (i < inner.length) {
          element += inner[i];
        }
        i++;
      }
      i++; // skip the closing quote
    } else {
      while (i < inner.length && inner[i] !== ",") {
        element += inner[i];
        i++;
      }
    }
    result.push(element);
    if (inner[i] === ",") i++;
  }
  return result;
}
/**
 * Dual-run gate. Wraps the Electric-backed client, hands its response back to the
 * caller untouched and, off the request path, diffs what the notifier path would
 * emit against it. The shadow work is fire-and-forget: it never blocks or fails the
 * client's request, and it exercises the read replica so the notifier's real load
 * can be measured before cutover.
 */
export class ShadowRealtimeClient implements RealtimeStreamClient {
  constructor(private readonly options: ShadowRealtimeClientOptions) {}

  /** Serve a single-run feed from Electric and shadow-compare it as the "run" feed. */
  async streamRun(
    url: URL | string,
    environment: RealtimeEnvironment,
    runId: string,
    apiVersion: API_VERSIONS,
    requestOptions?: RealtimeRequestOptions,
    clientVersion?: string,
    signal?: AbortSignal
  ): Promise<Response> {
    const { electric } = this.options;
    const served = await electric.streamRun(
      url,
      environment,
      runId,
      apiVersion,
      requestOptions,
      clientVersion,
      signal
    );
    this.#shadow("run", served, url, environment, requestOptions);
    return served;
  }

  /** Serve a tag-list feed from Electric and shadow-compare it as the "runs" feed. */
  async streamRuns(
    url: URL | string,
    environment: RealtimeListEnvironment,
    params: RealtimeRunsParams,
    apiVersion: API_VERSIONS,
    requestOptions?: RealtimeRequestOptions,
    clientVersion?: string,
    signal?: AbortSignal
  ): Promise<Response> {
    const { electric } = this.options;
    const served = await electric.streamRuns(
      url,
      environment,
      params,
      apiVersion,
      requestOptions,
      clientVersion,
      signal
    );
    // Missing tags are passed to the compare as an empty list.
    const tags = params.tags ?? [];
    this.#shadow("runs", served, url, environment, requestOptions, { tags });
    return served;
  }

  /** Serve a batch feed from Electric and shadow-compare it as the "batch" feed. */
  async streamBatch(
    url: URL | string,
    environment: RealtimeListEnvironment,
    batchId: string,
    apiVersion: API_VERSIONS,
    requestOptions?: RealtimeRequestOptions,
    clientVersion?: string,
    signal?: AbortSignal
  ): Promise<Response> {
    const { electric } = this.options;
    const served = await electric.streamBatch(
      url,
      environment,
      batchId,
      apiVersion,
      requestOptions,
      clientVersion,
      signal
    );
    this.#shadow("batch", served, url, environment, requestOptions, { batchId });
    return served;
  }

  /**
   * Kick off the background compare. Fire-and-forget: never blocks the served
   * response and never throws into the request.
   */
  #shadow(
    feed: ShadowFeed,
    electricResponse: Response,
    url: URL | string,
    environment: RealtimeEnvironment & { projectId?: string },
    requestOptions?: RealtimeRequestOptions,
    membership?: { tags?: string[]; batchId?: string }
  ): void {
    // The clone has to be taken synchronously, before the client consumes the body.
    const bodyClone = this.#cloneIfOk(electricResponse);
    if (bodyClone === undefined) {
      return;
    }

    const pending = this.#runShadow(
      feed,
      bodyClone,
      url,
      environment,
      requestOptions,
      membership
    );
    void pending.catch((error) => logger.debug("[shadowRealtime] compare failed", { feed, error }));
  }

  /** Clone a 200 response; yields `undefined` for any other status or if cloning throws. */
  #cloneIfOk(electricResponse: Response): Response | undefined {
    try {
      return electricResponse.status === 200 ? electricResponse.clone() : undefined;
    } catch {
      return undefined;
    }
  }

  /**
   * Read the cloned Electric body, run the comparator against it, report the outcome
   * to `onOutcome`, and log a warning when the two paths diverge.
   */
  async #runShadow(
    feed: ShadowFeed,
    bodyClone: Response,
    url: URL | string,
    environment: RealtimeEnvironment & { projectId?: string },
    requestOptions: RealtimeRequestOptions | undefined,
    membership: { tags?: string[]; batchId?: string } | undefined
  ): Promise<void> {
    const parsedUrl = new URL(url.toString());
    const query = parsedUrl.searchParams;
    const offset = query.get("offset") ?? "-1";
    const handle = query.get("handle") ?? query.get("shape_id");
    // No handle, or the sentinel offset, means the client is asking for a snapshot.
    const isInitialSnapshot = !handle || offset === "-1";
    const skipColumns = resolveSkipColumns(parsedUrl, requestOptions);
    const electricBody = await bodyClone.text();

    // Membership is only compared on the initial snapshot of a list/batch feed.
    const { projectId } = environment;
    let membershipFilter: RunListFilter | undefined;
    if (isInitialSnapshot && membership && projectId) {
      const { tags, batchId } = membership;
      // Batch feeds are not bounded by the createdAt window; tag feeds are.
      const createdAtAfter = batchId
        ? undefined
        : new Date(Date.now() - this.options.maximumCreatedAtFilterAgeMs);
      membershipFilter = {
        organizationId: environment.organizationId,
        projectId,
        environmentId: environment.id,
        tags,
        batchId,
        createdAtAfter,
        limit: this.options.maxListResults,
      };
    }

    const outcome = await this.options.comparator.compare({
      feed,
      electricBody,
      environment: { id: environment.id },
      skipColumns,
      isInitialSnapshot,
      membershipFilter,
    });

    this.options.onOutcome?.(outcome);

    const diverged = outcome.serializationDiverged > 0 || outcome.membershipMatch === false;
    if (!diverged) {
      return;
    }

    logger.warn("[shadowRealtime] divergence detected", {
      feed,
      serializationDiverged: outcome.serializationDiverged,
      serializationMatched: outcome.serializationMatched,
      serializationSkew: outcome.serializationSkew,
      membershipMatch: outcome.membershipMatch,
      // Cap the id lists so one bad compare cannot flood the log line.
      missingInNotifier: outcome.missingInNotifier?.slice(0, 20),
      extraInNotifier: outcome.extraInNotifier?.slice(0, 20),
      diffs: outcome.diffs,
    });
  }
}
/**
 * Process-singleton wiring for the shadow-compare client. Only constructed
 * when an org's `realtimeBackend` flag is set to "shadow".
 *
 * Builds the comparator on top of the read replica and the ClickHouse run-list
 * resolver, and reports every compare outcome to a Prometheus counter.
 */
function initializeShadowRealtimeClient(): ShadowRealtimeClient {
  const compares = new Counter({
    name: "realtime_shadow_compare_total",
    help: "Dual-run shadow-compare outcomes (Electric vs notifier). kind=serialization|membership, result=match|diverge|skew.",
    labelNames: ["feed", "kind", "result"] as const,
    registers: [metricsRegister],
  });

  const runListResolver = new ClickHouseRunListResolver({
    getClickhouse: (organizationId) =>
      clickhouseFactory.getClickhouseForOrganization(organizationId, "standard"),
    prisma: $replica,
  });
  const runReader = new RunHydrator({ replica: $replica });
  const comparator = new RealtimeShadowComparator({ runReader, runListResolver });

  return new ShadowRealtimeClient({
    electric: realtimeClient,
    comparator,
    maximumCreatedAtFilterAgeMs: env.REALTIME_MAXIMUM_CREATED_AT_FILTER_AGE_IN_MS,
    maxListResults: env.REALTIME_NOTIFIER_MAX_LIST_RESULTS,
    onOutcome: (outcome) => {
      const { feed } = outcome;

      // Serialization results are per-row tallies; zero/absent tallies are not reported.
      const serializationCounts = [
        ["match", outcome.serializationMatched],
        ["diverge", outcome.serializationDiverged],
        ["skew", outcome.serializationSkew],
      ] as const;
      for (const [result, count] of serializationCounts) {
        if (count) {
          compares.inc({ feed, kind: "serialization", result }, count);
        }
      }

      // Membership is a single verdict per compare, present only when it was checked.
      if (outcome.membershipMatch === undefined) {
        return;
      }
      compares.inc({
        feed,
        kind: "membership",
        result: outcome.membershipMatch ? "match" : "diverge",
      });
    },
  });
}
// Behavioural tests for BoundedTtlCache: TTL expiry on read and capacity eviction.
// As exercised below, the first constructor argument acts as the TTL in
// milliseconds and the second as the maximum number of entries.
describe("BoundedTtlCache", () => {
  // Restore real timers after each test so fake timers never leak between cases.
  afterEach(() => {
    vi.useRealTimers();
  });

  it("returns a live entry within its TTL", () => {
    // Fake timers are installed before the cache is created and written to.
    vi.useFakeTimers();
    const cache = new BoundedTtlCache(1_000, 100);
    cache.set("k", "v");
    // Half the TTL has elapsed: the entry must still be readable and counted.
    vi.advanceTimersByTime(500);
    expect(cache.get("k")).toBe("v");
    expect(cache.size).toBe(1);
  });

  it("evicts an expired entry on read instead of letting it linger", () => {
    vi.useFakeTimers();
    const cache = new BoundedTtlCache(1_000, 100);
    cache.set("a", 1);
    expect(cache.size).toBe(1);

    // Step just past the TTL so the entry is expired when it is next read.
    vi.advanceTimersByTime(1_001);
    expect(cache.get("a")).toBeUndefined();
    // The previous bug left expired entries in the map until an at-capacity sweep;
    // they must now be removed on read.
    expect(cache.size).toBe(0);
  });

  it("drops the oldest entry when full of still-live entries", () => {
    // Real timers here: the long TTL keeps every entry live for the whole test.
    const cache = new BoundedTtlCache(60_000, 2);
    cache.set("a", 1);
    cache.set("b", 2);
    cache.set("c", 3); // over capacity, none expired -> evict oldest insertion (a)
    expect(cache.get("a")).toBeUndefined();
    expect(cache.get("b")).toBe(2);
    expect(cache.get("c")).toBe(3);
    expect(cache.size).toBe(2);
  });
});
/**
 * Test-side decoder for a one-dimensional Postgres text-array literal
 * (`{"a","b"}` / `{}`).
 *
 * The outer braces are stripped unconditionally. Quoted elements may contain commas
 * and backslash escapes (the character after a backslash is taken literally);
 * unquoted elements run up to the next comma.
 *
 * @param literal The array literal to decode.
 * @returns The decoded elements in order.
 */
function parsePgTextArray(literal: string): string[] {
  if (literal === "{}") {
    return [];
  }
  const body = literal.slice(1, -1);
  const elements: string[] = [];
  let pos = 0;

  // Consume a double-quoted element, leaving `pos` just past its closing quote.
  const readQuoted = (): string => {
    pos += 1; // opening quote
    let text = "";
    for (; pos < body.length && body[pos] !== '"'; pos += 1) {
      if (body[pos] === "\\") {
        pos += 1;
      }
      text += body[pos];
    }
    pos += 1; // closing quote
    return text;
  };

  // Consume an unquoted element, leaving `pos` on the comma (or at the end).
  const readBare = (): string => {
    const comma = body.indexOf(",", pos);
    const stop = comma === -1 ? body.length : comma;
    const text = body.slice(pos, stop);
    pos = stop;
    return text;
  };

  while (pos < body.length) {
    elements.push(body[pos] === '"' ? readQuoted() : readBare());
    if (body[pos] === ",") {
      pos += 1;
    }
  }
  return elements;
}
expect(value.payload).toBe('{"hello":"world"}'); + + // int/float: stringified + expect(value.number).toBe("42"); + expect(value.usageDurationMs).toBe("1234"); + expect(value.costInCents).toBe("0.55"); + + // bool: postgres "t"/"f" + expect(value.isTest).toBe("t"); + + // timestamp: ISO without trailing Z (the SDK appends Z before parsing) + expect(value.updatedAt).toBe("2026-06-06T10:05:30.123"); + expect(value.createdAt).toBe("2026-06-06T10:00:00.000"); + + // nullable timestamp: null stays null + expect(value.delayUntil).toBeNull(); + expect(value.completedAt).toBeNull(); + + // text[]: quoted pg array literal; empty realtimeStreams (@default([])) => {} + expect(value.runTags).toBe('{"user:123","env:prod"}'); + expect(value.realtimeStreams).toBe("{}"); + + // jsonb: null stays null + expect(value.error).toBeNull(); + }); + + it("encodes an empty no-default array column (runTags) as null, matching Electric", () => { + // runTags has no Postgres default, so an empty value is stored as SQL NULL and + // Electric emits `null` (not `{}`). realtimeStreams has @default([]), so its + // empty value is `{}`. Prisma hands us `[]` for both; we re-derive the wire form. 
+ const value = serializeRunRow(sampleRow({ runTags: [], realtimeStreams: [] })); + expect(value.runTags).toBeNull(); + expect(value.realtimeStreams).toBe("{}"); + }); + + it("encodes jsonb error as a JSON string", () => { + const value = serializeRunRow(sampleRow({ error: { type: "STRING_ERROR", raw: "boom" } })); + expect(value.error).toBe('{"type":"STRING_ERROR","raw":"boom"}'); + }); + + it("round-trips through the client parser into a valid SubscribeRunRawShape", () => { + const row = sampleRow({ error: { type: "STRING_ERROR", raw: "boom" } }); + const value = serializeRunRow(row); + const schema = JSON.parse(buildElectricSchemaHeader()); + + const decoded = electricParse(value, schema); + const parsed = SubscribeRunRawShape.parse(decoded); + + expect(parsed.id).toBe("run_abc123"); + expect(parsed.friendlyId).toBe("run_friendly_abc"); + expect(parsed.status).toBe("EXECUTING"); + expect(parsed.number).toBe(42); + expect(parsed.isTest).toBe(true); + expect(parsed.usageDurationMs).toBe(1234); + expect(parsed.costInCents).toBeCloseTo(0.55); + expect(parsed.runTags).toEqual(["user:123", "env:prod"]); + expect(parsed.realtimeStreams).toEqual([]); + // RawShapeDate appends "Z" and coerces to a Date equal to the source instant. + expect(parsed.createdAt.toISOString()).toBe("2026-06-06T10:00:00.000Z"); + expect(parsed.updatedAt.toISOString()).toBe("2026-06-06T10:05:30.123Z"); + expect(parsed.startedAt?.toISOString()).toBe("2026-06-06T10:01:00.000Z"); + expect(parsed.delayUntil ?? 
null).toBeNull(); + expect(parsed.error).toEqual({ type: "STRING_ERROR", raw: "boom" }); + }); + + it("honors skipColumns (but never the reserved columns)", () => { + const value = serializeRunRow(sampleRow(), ["payload", "output", "id", "status"]); + expect(value.payload).toBeUndefined(); + expect(value.output).toBeUndefined(); + // reserved columns can't be skipped + expect(value.id).toBe("run_abc123"); + expect(value.status).toBe("EXECUTING"); + + const schema = JSON.parse(buildElectricSchemaHeader(["payload"])); + expect(schema.payload).toBeUndefined(); + expect(schema.status).toBeDefined(); + }); +}); + +describe("electricStreamProtocol message bodies", () => { + it("emits insert + up-to-date for an initial snapshot", () => { + const messages = JSON.parse(buildSnapshotBody(sampleRow())); + expect(messages).toHaveLength(2); + expect(messages[0].headers.operation).toBe("insert"); + expect(messages[0].key).toBe('"public"."TaskRun"/"run_abc123"'); + expect(messages[0].value.status).toBe("EXECUTING"); + expect(messages[1].headers.control).toBe("up-to-date"); + }); + + it("emits a bare up-to-date for an empty (missing) run snapshot", () => { + const messages = JSON.parse(buildSnapshotBody(null)); + expect(messages).toHaveLength(1); + expect(messages[0].headers.control).toBe("up-to-date"); + }); + + it("emits update + up-to-date for a live change", () => { + const messages = JSON.parse(buildUpdateBody(sampleRow())); + expect(messages[0].headers.operation).toBe("update"); + expect(messages[1].headers.control).toBe("up-to-date"); + }); + + it("emits a bare up-to-date when nothing advanced", () => { + const messages = JSON.parse(buildUpToDateBody()); + expect(messages).toEqual([{ headers: { control: "up-to-date" } }]); + }); + + it("uses the same merge key across insert and update so the client merges by row", () => { + const insert = JSON.parse(buildSnapshotBody(sampleRow()))[0]; + const update = JSON.parse(buildUpdateBody(sampleRow()))[0]; + 
expect(insert.key).toBe(update.key); + }); +}); + +describe("electricStreamProtocol multi-row (tag-list) bodies", () => { + it("emits one change message per row with per-row operation, then up-to-date", () => { + const a = sampleRow({ id: "run_a" }); + const b = sampleRow({ id: "run_b", status: "QUEUED" }); + const messages = JSON.parse( + buildRowsBody([ + { row: a, operation: "insert" }, + { row: b, operation: "update" }, + ]) + ); + expect(messages).toHaveLength(3); + expect(messages[0].headers.operation).toBe("insert"); + expect(messages[0].key).toBe('"public"."TaskRun"/"run_a"'); + expect(messages[1].headers.operation).toBe("update"); + expect(messages[1].key).toBe('"public"."TaskRun"/"run_b"'); + expect(messages[1].value.status).toBe("QUEUED"); + expect(messages[2].headers.control).toBe("up-to-date"); + }); + + it("emits a bare up-to-date for an empty change set", () => { + const messages = JSON.parse(buildRowsBody([])); + expect(messages).toEqual([{ headers: { control: "up-to-date" } }]); + }); + + it("honors skipColumns across all rows", () => { + const messages = JSON.parse( + buildRowsBody([{ row: sampleRow(), operation: "insert" }], ["payload"]) + ); + expect(messages[0].value.payload).toBeUndefined(); + expect(messages[0].value.status).toBe("EXECUTING"); + }); +}); + +describe("electricStreamProtocol tokens + legacy rewrite", () => { + it("encodes and parses the offset updatedAt segment", () => { + const offset = encodeOffset(1717667130123, 7); + expect(offset).toBe("1717667130123_7"); + expect(parseOffsetUpdatedAtMs(offset)).toBe(1717667130123); + }); + + it("treats the initial offset (-1) and garbage as zero", () => { + expect(parseOffsetUpdatedAtMs("-1")).toBe(0); + expect(parseOffsetUpdatedAtMs(null)).toBe(0); + expect(parseOffsetUpdatedAtMs("nonsense")).toBe(0); + }); + + it("rewrites DEQUEUED to EXECUTING for legacy API versions", () => { + const body = buildUpdateBody(sampleRow({ status: "DEQUEUED" })); + 
expect(body).toContain('"status":"DEQUEUED"'); + const rewritten = rewriteBodyForLegacyApiVersion(body); + expect(rewritten).not.toContain('"status":"DEQUEUED"'); + expect(rewritten).toContain('"status":"EXECUTING"'); + }); +}); diff --git a/apps/webapp/test/realtime/notifierRealtimeClient.test.ts b/apps/webapp/test/realtime/notifierRealtimeClient.test.ts new file mode 100644 index 00000000000..fb3349e0e62 --- /dev/null +++ b/apps/webapp/test/realtime/notifierRealtimeClient.test.ts @@ -0,0 +1,107 @@ +import { CURRENT_API_VERSION } from "~/api/versions"; +import { + NotifierRealtimeClient, + type RealtimeListEnvironment, +} from "~/services/realtime/notifierRealtimeClient.server"; +import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; +import { describe, expect, it } from "vitest"; + +function sampleRow(): RealtimeRunRow { + return { + id: "run_1", + taskIdentifier: "t", + createdAt: new Date("2026-06-07T10:00:00.000Z"), + updatedAt: new Date("2026-06-07T10:00:01.000Z"), + startedAt: null, + delayUntil: null, + queuedAt: null, + expiredAt: null, + completedAt: null, + friendlyId: "run_friendly_1", + number: 1, + isTest: false, + status: "EXECUTING", + usageDurationMs: 0, + costInCents: 0, + baseCostInCents: 0, + ttl: null, + payload: "{}", + payloadType: "application/json", + metadata: null, + metadataType: "application/json", + output: null, + outputType: "application/json", + runTags: [], + error: null, + realtimeStreams: [], + }; +} + +// Only the initial-snapshot path is exercised here, which touches the shared +// #buildResponse — enough to lock the response-header contract. +function makeClient(row: RealtimeRunRow | null) { + const never = { changed: new Promise(() => {}), unsubscribe() {} }; + return new NotifierRealtimeClient({ + runReader: { + getRunById: async () => row, + hydrateByIds: async () => (row ? 
[row] : []), + } as any, + runListResolver: { resolveMatchingRunIds: async () => [] } as any, + notifier: { + subscribeToRunChanges: () => never, + subscribeToEnvChanges: () => never, + } as any, + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, + }); +} + +const ENV: RealtimeListEnvironment = { + id: "env_1", + organizationId: "org_1", + projectId: "proj_1", +}; + +describe("NotifierRealtimeClient response headers", () => { + it("exposes electric headers cross-origin so browser hooks can read them", async () => { + const client = makeClient(sampleRow()); + const res = await client.streamRun( + "http://localhost:3030/realtime/v1/runs/run_1?offset=-1", + ENV, + "run_1", + CURRENT_API_VERSION, + undefined, + "1.0.0-beta.1" // modern client => lowercase electric-* headers + ); + + // Without these the deployed @electric-sql/client throws MissingHeadersError + // (it can't read the electric-* headers across origins). This regressed once. + expect(res.headers.get("access-control-allow-origin")).toBe("*"); + expect(res.headers.get("access-control-expose-headers")).toBe("*"); + + // Initial (non-live) snapshot requires offset + handle + schema. 
+ expect(res.headers.get("electric-offset")).toBeTruthy(); + expect(res.headers.get("electric-handle")).toBeTruthy(); + expect(res.headers.get("electric-schema")).toBeTruthy(); + expect(res.headers.get("content-type")).toBe("application/json"); + }); + + it("renames headers for legacy (0.4.0) clients", async () => { + const client = makeClient(sampleRow()); + const res = await client.streamRun( + "http://localhost:3030/realtime/v1/runs/run_1?offset=-1", + ENV, + "run_1", + CURRENT_API_VERSION, + undefined, + undefined // no client version => legacy header names + ); + + expect(res.headers.get("electric-chunk-last-offset")).toBeTruthy(); + expect(res.headers.get("electric-shape-id")).toBeTruthy(); + expect(res.headers.get("electric-offset")).toBeNull(); + expect(res.headers.get("electric-handle")).toBeNull(); + expect(res.headers.get("access-control-expose-headers")).toBe("*"); + }); +}); diff --git a/apps/webapp/test/realtime/notifierRunSetCache.test.ts b/apps/webapp/test/realtime/notifierRunSetCache.test.ts new file mode 100644 index 00000000000..a0beb0fd728 --- /dev/null +++ b/apps/webapp/test/realtime/notifierRunSetCache.test.ts @@ -0,0 +1,173 @@ +import { CURRENT_API_VERSION } from "~/api/versions"; +import { + NotifierRealtimeClient, + type RealtimeListEnvironment, +} from "~/services/realtime/notifierRealtimeClient.server"; +import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; +import { describe, expect, it, vi } from "vitest"; + +const ENV: RealtimeListEnvironment = { id: "env_1", organizationId: "org_1", projectId: "proj_1" }; + +function row(id: string): RealtimeRunRow { + // Only id/createdAt/updatedAt are read directly; the rest serialize to null. 
+ return { + id, + createdAt: new Date("2026-06-07T09:00:00.000Z"), + updatedAt: new Date("2026-06-07T10:00:00.000Z"), + } as unknown as RealtimeRunRow; +} + +function makeClient(overrides: Record = {}) { + const resolveSpy = vi.fn(async () => ["run_1", "run_2"]); + const hydrateSpy = vi.fn(async (_env: string, ids: string[]) => ids.map(row)); + const never = { changed: new Promise(() => {}), unsubscribe() {} }; + + const client = new NotifierRealtimeClient({ + runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, + runListResolver: { resolveMatchingRunIds: resolveSpy } as any, + notifier: { subscribeToRunChanges: () => never, subscribeToEnvChanges: () => never } as any, + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, + runSetResolveCacheTtlMs: 5_000, + ...overrides, + }); + + return { client, resolveSpy, hydrateSpy }; +} + +// streamBatch with offset=-1 takes the snapshot path, which calls the coalescing +// resolve+hydrate directly (no concurrency slot / subscription needed). +function snapshot(client: NotifierRealtimeClient, batchId: string, skipColumns?: string) { + const skip = skipColumns ? `&skipColumns=${skipColumns}` : ""; + return client.streamBatch( + `http://localhost:3030/realtime/v1/batches/${batchId}?offset=-1${skip}`, + ENV, + batchId, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); +} + +// Tag-list snapshot (offset=-1) — exercises the createdAt bucketing + cache key. 
+function snapshotTag(client: NotifierRealtimeClient, tags: string[]) { + return client.streamRuns( + "http://localhost:3030/realtime/v1/runs?offset=-1", + ENV, + { tags }, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); +} + +describe("NotifierRealtimeClient run-set resolve coalescing + cache", () => { + it("coalesces concurrent same-filter resolves into one ClickHouse + Postgres query", async () => { + const { client, resolveSpy, hydrateSpy } = makeClient(); + let release!: (ids: string[]) => void; + const gate = new Promise((resolve) => { + release = resolve; + }); + resolveSpy.mockReturnValueOnce(gate); + + const p1 = snapshot(client, "batch_1"); + const p2 = snapshot(client, "batch_1"); + release(["run_1"]); + await Promise.all([p1, p2]); + + expect(resolveSpy).toHaveBeenCalledTimes(1); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + }); + + it("serves a second same-filter request from the cache within the TTL", async () => { + const { client, resolveSpy, hydrateSpy } = makeClient(); + await snapshot(client, "batch_1"); + await snapshot(client, "batch_1"); + expect(resolveSpy).toHaveBeenCalledTimes(1); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + }); + + it("does not share the cache across different filters", async () => { + const { client, resolveSpy } = makeClient(); + await snapshot(client, "batch_1"); + await snapshot(client, "batch_2"); + expect(resolveSpy).toHaveBeenCalledTimes(2); + }); + + it("re-queries after the cache TTL expires", async () => { + vi.useFakeTimers({ toFake: ["Date"] }); + try { + const { client, resolveSpy } = makeClient({ runSetResolveCacheTtlMs: 1_000 }); + await snapshot(client, "batch_1"); + vi.advanceTimersByTime(1_001); + await snapshot(client, "batch_1"); + expect(resolveSpy).toHaveBeenCalledTimes(2); + } finally { + vi.useRealTimers(); + } + }); + + it("passes the client's skipColumns through to the hydrator (column projection)", async () => { + const { client, hydrateSpy } = makeClient(); + await snapshot(client, 
"batch_1", "payload,output"); + expect(hydrateSpy).toHaveBeenCalledWith("env_1", expect.any(Array), ["payload", "output"]); + }); + + it("reports resolve outcomes (miss then hit) to the metrics hook", async () => { + const results: string[] = []; + const { client } = makeClient({ onRunSetResolve: (r: string) => results.push(r) }); + await snapshot(client, "batch_1"); + await snapshot(client, "batch_1"); + expect(results).toEqual(["miss", "hit"]); + }); +}); + +describe("NotifierRealtimeClient tag-list createdAt bucketing", () => { + it("floors the resolved createdAt lower bound to the bucket boundary", async () => { + // Fix the clock to a non-bucket-aligned instant so the assertion is deterministic. + vi.useFakeTimers({ toFake: ["Date"] }); + vi.setSystemTime(new Date("2026-06-07T10:00:30.500Z")); + try { + const { client, resolveSpy } = makeClient({ runSetCreatedAtBucketMs: 60_000 }); + await snapshotTag(client, ["critical"]); + const passed = resolveSpy.mock.calls[0][0].createdAtAfter as Date; + expect(passed.getTime() % 60_000).toBe(0); + } finally { + vi.useRealTimers(); + } + }); + + it("lets two same-tag feeds in the same bucket share one resolve", async () => { + // A large bucket guarantees both windows floor to the same boundary regardless of + // the sub-millisecond gap between the two calls. 
/**
 * Adapt the connection details supplied by the `redisTest` fixture into the options
 * object handed to `RunChangeNotifier`, with TLS and cluster mode switched off.
 *
 * @param redisOptions Host, port and password supplied by the fixture (all optional).
 * @returns The same connection fields plus `tlsDisabled: true` and `clusterMode: false`.
 */
function toRedisOptions(redisOptions: { host?: string; port?: number; password?: string }) {
  const { host, port, password } = redisOptions;
  return { host, port, password, tlsDisabled: true, clusterMode: false };
}
+const SUBSCRIBE_SETTLE_MS = 250; + +describe("RunChangeNotifier", () => { + redisTest( + "delivers a published change to a subscriber", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const subscription = notifier.subscribeToRunChanges("run_1"); + expect(notifier.activeSubscriptionCount).toBe(1); + + let resolved = false; + void subscription.changed.then(() => { + resolved = true; + }); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_1" }); + + await vi.waitFor(() => expect(resolved).toBe(true), { timeout: 5_000, interval: 50 }); + + subscription.unsubscribe(); + // Cleanup is deferred until Redis confirms UNSUBSCRIBE (avoids a + // subscribe/unsubscribe race), so the count converges to 0 asynchronously. + await vi.waitFor(() => expect(notifier.activeSubscriptionCount).toBe(0), { + timeout: 5_000, + interval: 50, + }); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "does not wake a subscriber for a different run", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const subscription = notifier.subscribeToRunChanges("run_a"); + let resolved = false; + void subscription.changed.then(() => { + resolved = true; + }); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_b" }); + await sleep(500); + + expect(resolved).toBe(false); + subscription.unsubscribe(); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "refcounts subscriptions per run and wakes all waiters", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const first = notifier.subscribeToRunChanges("run_x"); + const second = notifier.subscribeToRunChanges("run_x"); + + // Two waiters, one distinct channel. 
+ expect(notifier.activeSubscriptionCount).toBe(1); + + let firstResolved = false; + let secondResolved = false; + void first.changed.then(() => (firstResolved = true)); + void second.changed.then(() => (secondResolved = true)); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_x" }); + + await vi.waitFor(() => expect(firstResolved && secondResolved).toBe(true), { + timeout: 5_000, + interval: 50, + }); + + // Channel stays until the last waiter unsubscribes. Dropping one waiter only + // shrinks the listener set (no UNSUBSCRIBE), so the count is still 1 synchronously. + first.unsubscribe(); + expect(notifier.activeSubscriptionCount).toBe(1); + // The last unsubscribe issues UNSUBSCRIBE; the channel is dropped once Redis confirms. + second.unsubscribe(); + await vi.waitFor(() => expect(notifier.activeSubscriptionCount).toBe(0), { + timeout: 5_000, + interval: 50, + }); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "publish with no subscribers is a harmless no-op", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + expect(() => notifier.publish({ runId: "nobody_listening" })).not.toThrow(); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "wakes an env subscriber when a run in that env changes (tag-list feed)", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const envSub = notifier.subscribeToEnvChanges("env_1"); + let envWoke = false; + void envSub.changed.then(() => { + envWoke = true; + }); + + await sleep(SUBSCRIBE_SETTLE_MS); + // A run change WITH an environmentId fans out to the per-env channel. 
+ notifier.publish({ runId: "run_1", environmentId: "env_1" }); + + await vi.waitFor(() => expect(envWoke).toBe(true), { timeout: 5_000, interval: 50 }); + envSub.unsubscribe(); + await vi.waitFor(() => expect(notifier.activeSubscriptionCount).toBe(0), { + timeout: 5_000, + interval: 50, + }); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "does not wake an env subscriber for a different env, nor when env is omitted", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const envSub = notifier.subscribeToEnvChanges("env_a"); + let envWoke = false; + void envSub.changed.then(() => { + envWoke = true; + }); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_1", environmentId: "env_b" }); // different env + notifier.publish({ runId: "run_2" }); // no env -> per-run channel only + await sleep(500); + + expect(envWoke).toBe(false); + envSub.unsubscribe(); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "re-subscribing right after the last unsubscribe still delivers", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); + try { + const first = notifier.subscribeToRunChanges("run_race"); + await sleep(SUBSCRIBE_SETTLE_MS); + + // Drop the last waiter (issues UNSUBSCRIBE) and immediately re-subscribe before + // it can settle. The channel must end up subscribed so the new waiter wakes. 
+ first.unsubscribe(); + const second = notifier.subscribeToRunChanges("run_race"); + let woke = false; + void second.changed.then(() => { + woke = true; + }); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_race" }); + + await vi.waitFor(() => expect(woke).toBe(true), { timeout: 5_000, interval: 50 }); + second.unsubscribe(); + } finally { + await notifier.quit(); + } + } + ); +}); diff --git a/apps/webapp/test/realtime/runReaderProjection.test.ts b/apps/webapp/test/realtime/runReaderProjection.test.ts new file mode 100644 index 00000000000..22ba3ac72fa --- /dev/null +++ b/apps/webapp/test/realtime/runReaderProjection.test.ts @@ -0,0 +1,57 @@ +import { describe, expect, it, vi } from "vitest"; +import { buildHydratorSelect, RunHydrator } from "~/services/realtime/runReader.server"; + +describe("buildHydratorSelect", () => { + it("returns the full select when nothing is skipped", () => { + const select = buildHydratorSelect([]); + expect(select.id).toBe(true); + expect(select.payload).toBe(true); + expect(select.output).toBe(true); + expect(select.metadata).toBe(true); + expect(select.error).toBe(true); + }); + + it("drops skipped columns but always keeps id + updatedAt", () => { + const select = buildHydratorSelect(["payload", "output", "metadata", "error"]); + expect(select.payload).toBeUndefined(); + expect(select.output).toBeUndefined(); + expect(select.metadata).toBeUndefined(); + expect(select.error).toBeUndefined(); + // Needed internally regardless of skipColumns (keys the row, drives the diff/offset). + expect(select.id).toBe(true); + expect(select.updatedAt).toBe(true); + // A non-skipped column survives. 
+ expect(select.status).toBe(true); + }); +}); + +describe("RunHydrator.hydrateByIds column projection", () => { + function makeHydrator() { + let capturedSelect: Record | undefined; + const replica = { + taskRun: { + findMany: vi.fn(async ({ select }: { select: Record }) => { + capturedSelect = select; + return []; + }), + }, + } as any; + return { hydrator: new RunHydrator({ replica }), getSelect: () => capturedSelect }; + } + + it("projects the SELECT by skipColumns", async () => { + const { hydrator, getSelect } = makeHydrator(); + await hydrator.hydrateByIds("env_1", ["run_1"], ["payload", "output"]); + const select = getSelect()!; + expect(select.payload).toBeUndefined(); + expect(select.output).toBeUndefined(); + expect(select.id).toBe(true); + expect(select.updatedAt).toBe(true); + }); + + it("selects the full column set when no skipColumns are given", async () => { + const { hydrator, getSelect } = makeHydrator(); + await hydrator.hydrateByIds("env_1", ["run_1"]); + expect(getSelect()!.payload).toBe(true); + }); +}); diff --git a/apps/webapp/test/realtime/shadowCompare.test.ts b/apps/webapp/test/realtime/shadowCompare.test.ts new file mode 100644 index 00000000000..31bb527589f --- /dev/null +++ b/apps/webapp/test/realtime/shadowCompare.test.ts @@ -0,0 +1,212 @@ +import { + type RealtimeRunRow, + serializeRunRow, +} from "~/services/realtime/electricStreamProtocol.server"; +import { type RunListFilter } from "~/services/realtime/runReader.server"; +import { RealtimeShadowComparator } from "~/services/realtime/shadowCompare.server"; +import { describe, expect, it } from "vitest"; + +function sampleRow(overrides: Partial = {}): RealtimeRunRow { + return { + id: "run_a", + taskIdentifier: "my-task", + createdAt: new Date("2026-06-07T09:00:00.000Z"), + updatedAt: new Date("2026-06-07T10:05:30.123Z"), + startedAt: null, + delayUntil: null, + queuedAt: null, + expiredAt: null, + completedAt: null, + friendlyId: "run_friendly_a", + number: 7, + isTest: true, + 
status: "EXECUTING", + usageDurationMs: 1234, + costInCents: 0.55, + baseCostInCents: 0.25, + ttl: "1h", + payload: '{"hello":"world"}', + payloadType: "application/json", + metadata: null, + metadataType: "application/json", + output: null, + outputType: "application/json", + runTags: ["a", "b"], + error: null, + realtimeStreams: [], + ...overrides, + }; +} + +const UP_TO_DATE = { headers: { control: "up-to-date" } }; + +function insert(value: Record) { + return { key: `"public"."TaskRun"/"${value.id}"`, value, headers: { operation: "insert" } }; +} + +function makeComparator( + rowsById: Record, + resolvedIds: string[] = [] +) { + return new RealtimeShadowComparator({ + runReader: { getRunById: async (_env: string, id: string) => rowsById[id] ?? null } as any, + runListResolver: { resolveMatchingRunIds: async (_f: RunListFilter) => resolvedIds } as any, + }); +} + +describe("RealtimeShadowComparator serialization", () => { + it("counts a faithful re-serialization as a match", async () => { + const row = sampleRow(); + const body = JSON.stringify([insert(serializeRunRow(row)), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationMatched).toBe(1); + expect(out.serializationDiverged).toBe(0); + expect(out.serializationSkew).toBe(0); + expect(out.diffs).toEqual([]); + }); + + it("does not flag semantically-equivalent but differently-encoded values", async () => { + const row = sampleRow(); + // Electric encodes bool as "true" (notifier uses "t"), a number with a trailing + // zero, and a timestamp without millis — all equal after decoding. 
+ const value = { + ...serializeRunRow(row), + isTest: "true", + costInCents: "0.5500", + createdAt: "2026-06-07T09:00:00", + }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationMatched).toBe(1); + expect(out.serializationDiverged).toBe(0); + }); + + it("flags a genuine column divergence (same version)", async () => { + const row = sampleRow(); + const value = { ...serializeRunRow(row), payload: '{"hello":"TAMPERED"}' }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationDiverged).toBe(1); + expect(out.serializationMatched).toBe(0); + expect(out.diffs).toEqual([ + { runId: "run_a", column: "payload", electric: '{"hello":"TAMPERED"}', notifier: '{"hello":"world"}' }, + ]); + }); + + it("treats DEQUEUED/EXECUTING as equivalent (legacy status rewrite)", async () => { + const row = sampleRow({ status: "EXECUTING" }); + const value = { ...serializeRunRow(row), status: "DEQUEUED" }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationDiverged).toBe(0); + expect(out.serializationMatched).toBe(1); + }); + + it("records skew when the row advanced between emit and refetch", async () => { + const row = sampleRow(); + // Electric emitted an older version; the refetched row is newer. 
+ const value = { ...serializeRunRow(sampleRow({ updatedAt: new Date("2026-06-07T10:00:00.000Z") })) }; + const body = JSON.stringify([insert(value), UP_TO_DATE]); + const cmp = makeComparator({ run_a: row }); + + const out = await cmp.compare({ + feed: "run", + electricBody: body, + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + }); + + expect(out.serializationSkew).toBe(1); + expect(out.serializationMatched).toBe(0); + expect(out.serializationDiverged).toBe(0); + }); +}); + +describe("RealtimeShadowComparator membership", () => { + const filter: RunListFilter = { + organizationId: "org_1", + projectId: "proj_1", + environmentId: "env_1", + tags: ["t"], + createdAtAfter: new Date("2026-06-06T00:00:00.000Z"), + limit: 1000, + }; + + function bodyFor(ids: string[]) { + const msgs = ids.map((id) => insert(serializeRunRow(sampleRow({ id })))); + return JSON.stringify([...msgs, UP_TO_DATE]); + } + + it("matches when Electric's set equals the notifier resolver's set", async () => { + const cmp = makeComparator( + { a: sampleRow({ id: "a" }), b: sampleRow({ id: "b" }) }, + ["a", "b"] + ); + const out = await cmp.compare({ + feed: "runs", + electricBody: bodyFor(["a", "b"]), + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + membershipFilter: filter, + }); + expect(out.membershipMatch).toBe(true); + expect(out.missingInNotifier).toEqual([]); + expect(out.extraInNotifier).toEqual([]); + }); + + it("reports rows missing from / extra in the notifier resolution", async () => { + const cmp = makeComparator( + { a: sampleRow({ id: "a" }), b: sampleRow({ id: "b" }) }, + ["a", "c"] // notifier missing b, has extra c + ); + const out = await cmp.compare({ + feed: "runs", + electricBody: bodyFor(["a", "b"]), + environment: { id: "env_1" }, + skipColumns: [], + isInitialSnapshot: true, + membershipFilter: filter, + }); + expect(out.membershipMatch).toBe(false); + expect(out.missingInNotifier).toEqual(["b"]); + 
expect(out.extraInNotifier).toEqual(["c"]); + }); +}); From 13d2511170b48b84beeb26e4883d4a2f345106cc Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 10:39:51 +0100 Subject: [PATCH 02/23] fix(webapp): harden the realtime runs backend Addresses review feedback on the new backend: - skip cache eviction when updating an existing key at capacity - treat a concurrency limit of 0 as valid (enforce it, not a 500) - gate subscribeToRunChanges behind the enable switch - keep protocol-reserved columns in the hydration projection - re-clamp a handle-recovered createdAt to the max-age floor - bulk-hydrate the shadow comparator instead of per-run reads - log only run id and column on divergence, never raw cell values --- .../app/services/realtime/boundedTtlCache.ts | 4 +- .../realtime/notifierRealtimeClient.server.ts | 16 +++++- .../runChangeNotifierInstance.server.ts | 3 ++ .../app/services/realtime/runReader.server.ts | 4 +- .../services/realtime/shadowCompare.server.ts | 10 +++- .../realtime/shadowRealtimeClient.server.ts | 4 +- .../test/realtime/boundedTtlCache.test.ts | 11 ++++ .../test/realtime/notifierRunSetCache.test.ts | 54 +++++++++++++++++++ .../test/realtime/runReaderProjection.test.ts | 18 +++++++ .../test/realtime/shadowCompare.test.ts | 6 ++- 10 files changed, 122 insertions(+), 8 deletions(-) diff --git a/apps/webapp/app/services/realtime/boundedTtlCache.ts b/apps/webapp/app/services/realtime/boundedTtlCache.ts index 643f23607c5..8efcde55609 100644 --- a/apps/webapp/app/services/realtime/boundedTtlCache.ts +++ b/apps/webapp/app/services/realtime/boundedTtlCache.ts @@ -34,7 +34,9 @@ export class BoundedTtlCache { } set(key: string, value: V): void { - if (this.#entries.size >= this.maxEntries) { + // Only run capacity eviction when inserting a NEW key — updating an existing key + // doesn't grow the map, so it must never drop an unrelated live entry. 
+ if (!this.#entries.has(key) && this.#entries.size >= this.maxEntries) { const now = Date.now(); for (const [key, entry] of this.#entries) { if (entry.expiresAt <= now) { diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts index 9c70fd1acb9..38874b2de4b 100644 --- a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts +++ b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts @@ -238,10 +238,15 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { } // Recover the pinned window from the handle so the lower bound never drifts. + // Re-clamp the recovered value to the max-age floor so a stale or crafted handle + // can't widen the lookback past the configured ceiling. + const recoveredMs = this.#filterMsFromHandle(handle); const filter: RunSetFilter = { tags, createdAtAfter: new Date( - this.#filterMsFromHandle(handle) ?? this.#computeCreatedAtFilter(params.createdAt).getTime() + recoveredMs !== undefined + ? this.#clampCreatedAtFloor(recoveredMs) + : this.#computeCreatedAtFilter(params.createdAt).getTime() ), }; @@ -573,6 +578,13 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { return bucket > 0 ? Math.floor(ms / bucket) * bucket : ms; } + /** Clamp a handle-recovered createdAt lower bound up to the max-age floor (so a + * stale or crafted handle can't widen the window past the ceiling), then re-bucket. */ + #clampCreatedAtFloor(ms: number): number { + const floorMs = Date.now() - this.options.maximumCreatedAtFilterAgeMs; + return this.#bucketCreatedAtMs(Math.max(ms, floorMs)); + } + #mintListHandle(createdAtFilterMs: number): string { // Pins the createdAt threshold in the opaque handle so live polls reuse the // same lower bound even on a working-set cache miss. 
@@ -615,7 +627,7 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { DEFAULT_CONCURRENCY_LIMIT ); - if (!concurrencyLimit) { + if (concurrencyLimit == null) { logger.error("[notifierRealtimeClient] Failed to get concurrency limit", { organizationId: environment.organizationId, }); diff --git a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts index 545887abc61..71001192c1a 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts @@ -69,5 +69,8 @@ export function publishManyRunChanged(inputs: RunChangeInput[]): void { /** Subscribe to the next change for a run via the shared subscriber. */ export function subscribeToRunChanges(runId: string): RunChangeSubscription { + if (!notifierEnabled) { + throw new Error("Run change notifier is disabled"); + } return getRunChangeNotifier().subscribeToRunChanges(runId); } diff --git a/apps/webapp/app/services/realtime/runReader.server.ts b/apps/webapp/app/services/realtime/runReader.server.ts index 6fe59c3c059..4135e94366b 100644 --- a/apps/webapp/app/services/realtime/runReader.server.ts +++ b/apps/webapp/app/services/realtime/runReader.server.ts @@ -1,6 +1,6 @@ import { type Prisma, type PrismaClient } from "@trigger.dev/database"; import { BoundedTtlCache } from "./boundedTtlCache"; -import { type RealtimeRunRow } from "./electricStreamProtocol.server"; +import { RESERVED_COLUMNS, type RealtimeRunRow } from "./electricStreamProtocol.server"; /** * RunReader — the pluggable read half of the notifier-backed realtime feed. @@ -52,7 +52,7 @@ export const RUN_HYDRATOR_SELECT = { * `buildHydratorSelect`), so the replica doesn't ship large `payload`/`output`/ * `metadata`/`error` columns the response will drop anyway. 
*/ -const ALWAYS_HYDRATED_COLUMNS = new Set(["id", "updatedAt"]); +const ALWAYS_HYDRATED_COLUMNS = new Set(["id", "updatedAt", ...RESERVED_COLUMNS]); /** Project `RUN_HYDRATOR_SELECT` down to the columns the client didn't skip (plus * the always-needed ones). An empty skip set returns the full select unchanged. */ diff --git a/apps/webapp/app/services/realtime/shadowCompare.server.ts b/apps/webapp/app/services/realtime/shadowCompare.server.ts index 9a30d93c4da..b24540bfca3 100644 --- a/apps/webapp/app/services/realtime/shadowCompare.server.ts +++ b/apps/webapp/app/services/realtime/shadowCompare.server.ts @@ -95,13 +95,21 @@ export class RealtimeShadowComparator { diffs: [], }; + // Bulk-hydrate every emitted run in one query rather than a per-message round + // trip, so shadow mode doesn't inflate the very replica load it's measuring. + const emittedIds = changes + .map((m) => m.value.id) + .filter((id): id is string => typeof id === "string"); + const hydrated = await this.options.runReader.hydrateByIds(input.environment.id, emittedIds); + const rowsById = new Map(hydrated.map((row) => [row.id, row])); + for (const message of changes) { const runId = message.value.id ?? undefined; if (!runId) { continue; } - const row = await this.options.runReader.getRunById(input.environment.id, runId); + const row = rowsById.get(runId); if (!row) { // Run no longer readable (deleted / replica miss). Not a serialization divergence. 
outcome.serializationSkew++; diff --git a/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts b/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts index 1ddf162fd87..b66b70e7ad5 100644 --- a/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts +++ b/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts @@ -180,7 +180,9 @@ export class ShadowRealtimeClient implements RealtimeStreamClient { membershipMatch: outcome.membershipMatch, missingInNotifier: outcome.missingInNotifier?.slice(0, 20), extraInNotifier: outcome.extraInNotifier?.slice(0, 20), - diffs: outcome.diffs, + // Log only which run/column diverged, never the raw cell values — they can + // include run payload/output/metadata and must not leak into logs. + diffs: outcome.diffs.map(({ runId, column }) => ({ runId, column })), }); } } diff --git a/apps/webapp/test/realtime/boundedTtlCache.test.ts b/apps/webapp/test/realtime/boundedTtlCache.test.ts index e487798750e..a3fb0b1e425 100644 --- a/apps/webapp/test/realtime/boundedTtlCache.test.ts +++ b/apps/webapp/test/realtime/boundedTtlCache.test.ts @@ -28,6 +28,17 @@ describe("BoundedTtlCache", () => { expect(cache.size).toBe(0); }); + it("does not evict another entry when updating an existing key at capacity", () => { + const cache = new BoundedTtlCache(60_000, 2); + cache.set("a", 1); + cache.set("b", 2); + // Updating an existing key doesn't grow the map, so it must not drop "b". 
+ cache.set("a", 11); + expect(cache.get("a")).toBe(11); + expect(cache.get("b")).toBe(2); + expect(cache.size).toBe(2); + }); + it("drops the oldest entry when full of still-live entries", () => { const cache = new BoundedTtlCache(60_000, 2); cache.set("a", 1); diff --git a/apps/webapp/test/realtime/notifierRunSetCache.test.ts b/apps/webapp/test/realtime/notifierRunSetCache.test.ts index a0beb0fd728..90e3446e792 100644 --- a/apps/webapp/test/realtime/notifierRunSetCache.test.ts +++ b/apps/webapp/test/realtime/notifierRunSetCache.test.ts @@ -171,3 +171,57 @@ describe("NotifierRealtimeClient tag-list createdAt bucketing", () => { } }); }); + +describe("NotifierRealtimeClient review fixes", () => { + const ready = { changed: Promise.resolve(), unsubscribe() {} }; + const liveNotifier = { subscribeToRunChanges: () => ready, subscribeToEnvChanges: () => ready }; + + it("clamps a stale/crafted handle's createdAt up to the max-age floor", async () => { + const maxAge = 24 * 60 * 60 * 1000; + const { client, resolveSpy } = makeClient({ + notifier: liveNotifier, + maximumCreatedAtFilterAgeMs: maxAge, + runSetCreatedAtBucketMs: 0, + livePollTimeoutMs: 50, + }); + const before = Date.now(); + // Handle encodes createdAt = 1ms epoch, far older than the 24h ceiling. + await client.streamRuns( + "http://localhost:3030/realtime/v1/runs?offset=123_1&live=true&handle=runs_1_7", + ENV, + { tags: ["t"] }, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); + const passed = resolveSpy.mock.calls[0][0].createdAtAfter as Date; + // Clamped to ~now - maxAge, not the epoch value encoded in the handle. 
+ expect(passed.getTime()).toBeGreaterThan(before - maxAge - 1_000); + }); + + it("enforces a concurrency limit of 0 instead of failing with a 500", async () => { + let limitCheckedWith: number | undefined; + const { client } = makeClient({ + notifier: liveNotifier, + cachedLimitProvider: { getCachedLimit: async () => 0 }, + limiter: { + incrementAndCheck: async (_env: string, _id: string, limit: number) => { + limitCheckedWith = limit; + return true; + }, + decrement: async () => {}, + }, + livePollTimeoutMs: 50, + }); + const res = await client.streamBatch( + "http://localhost:3030/realtime/v1/batches/batch_1?offset=123_1&live=true", + ENV, + "batch_1", + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); + expect(res.status).toBe(200); + expect(limitCheckedWith).toBe(0); + }); +}); diff --git a/apps/webapp/test/realtime/runReaderProjection.test.ts b/apps/webapp/test/realtime/runReaderProjection.test.ts index 22ba3ac72fa..07aebf92589 100644 --- a/apps/webapp/test/realtime/runReaderProjection.test.ts +++ b/apps/webapp/test/realtime/runReaderProjection.test.ts @@ -11,6 +11,24 @@ describe("buildHydratorSelect", () => { expect(select.error).toBe(true); }); + it("keeps protocol-reserved columns even when asked to skip them", () => { + // Reserved columns are always emitted by the serializer, so hydration must keep + // them regardless of skipColumns or the output is null/incorrect. + const select = buildHydratorSelect([ + "status", + "taskIdentifier", + "createdAt", + "friendlyId", + "payload", + ]); + expect(select.status).toBe(true); + expect(select.taskIdentifier).toBe(true); + expect(select.createdAt).toBe(true); + expect(select.friendlyId).toBe(true); + // A non-reserved skipped column is still dropped. 
+ expect(select.payload).toBeUndefined(); + }); + it("drops skipped columns but always keeps id + updatedAt", () => { const select = buildHydratorSelect(["payload", "output", "metadata", "error"]); expect(select.payload).toBeUndefined(); diff --git a/apps/webapp/test/realtime/shadowCompare.test.ts b/apps/webapp/test/realtime/shadowCompare.test.ts index 31bb527589f..e6604a02cd6 100644 --- a/apps/webapp/test/realtime/shadowCompare.test.ts +++ b/apps/webapp/test/realtime/shadowCompare.test.ts @@ -49,7 +49,11 @@ function makeComparator( resolvedIds: string[] = [] ) { return new RealtimeShadowComparator({ - runReader: { getRunById: async (_env: string, id: string) => rowsById[id] ?? null } as any, + runReader: { + getRunById: async (_env: string, id: string) => rowsById[id] ?? null, + hydrateByIds: async (_env: string, ids: string[]) => + ids.map((id) => rowsById[id]).filter((row): row is RealtimeRunRow => Boolean(row)), + } as any, runListResolver: { resolveMatchingRunIds: async (_f: RunListFilter) => resolvedIds } as any, }); } From e0666404da97fcbffe8884eebeeb2b2f36f59edc Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 10:56:23 +0100 Subject: [PATCH 03/23] fix(webapp): enforce the realtime tag/batch result cap exactly The id resolver returned the repository's has-more overfetch (size + 1), so the feed could emit one run past the configured cap. Trim to the limit. 
--- .../services/realtime/clickHouseRunListResolver.server.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts index 545c4a43211..16dda7838b7 100644 --- a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts +++ b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts @@ -27,7 +27,7 @@ export class ClickHouseRunListResolver implements RunListResolver { const clickhouse = await this.options.getClickhouse(filter.organizationId); const repository = new RunsRepository({ clickhouse, prisma: this.options.prisma }); - return repository.listRunIds({ + const ids = await repository.listRunIds({ organizationId: filter.organizationId, projectId: filter.projectId, environmentId: filter.environmentId, @@ -36,5 +36,9 @@ export class ClickHouseRunListResolver implements RunListResolver { from: filter.createdAtAfter?.getTime(), page: { size: filter.limit }, }); + + // listRunIds overfetches by one (size + 1) for has-more detection and doesn't + // trim, so enforce the caller's cap here. + return ids.slice(0, filter.limit); } } From fdadc18d682e6d63061491723c627ac942a3f907 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 12:08:21 +0100 Subject: [PATCH 04/23] feat(webapp): give the realtime runs feed its own ClickHouse pool Resolve tag/batch run ids on a dedicated REALTIME_RUNS_CLICKHOUSE_* pool (falling back to CLICKHOUSE_URL) so the feed can't contend with the shared analytics client. 
--- apps/webapp/app/env.server.ts | 14 ++++++ .../clickhouse/clickhouseFactory.server.ts | 49 ++++++++++++++++++- .../notifierRealtimeClientInstance.server.ts | 2 +- .../shadowRealtimeClientInstance.server.ts | 2 +- 4 files changed, 64 insertions(+), 3 deletions(-) diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 3cdfdbf51fc..c0c61912414 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -1633,6 +1633,20 @@ const EnvironmentSchema = z .enum(["log", "error", "warn", "info", "debug"]) .default("info"), RUN_ENGINE_CLICKHOUSE_COMPRESSION_REQUEST: z.string().default("1"), + // ClickHouse client used by the realtime runs feed for tag/batch id resolution. + // Kept on its own URL + pool so the feed's reads can't contend with the main + // analytics client (CLICKHOUSE_URL). Falls back to the main URL when unset. + REALTIME_RUNS_CLICKHOUSE_URL: z + .string() + .optional() + .transform((v) => v ?? process.env.CLICKHOUSE_URL), + REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_ENABLED: z.string().default("1"), + REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS: z.coerce.number().int().optional(), + REALTIME_RUNS_CLICKHOUSE_MAX_OPEN_CONNECTIONS: z.coerce.number().int().default(10), + REALTIME_RUNS_CLICKHOUSE_LOG_LEVEL: z + .enum(["log", "error", "warn", "info", "debug"]) + .default("info"), + REALTIME_RUNS_CLICKHOUSE_COMPRESSION_REQUEST: z.string().default("1"), EVENTS_CLICKHOUSE_BATCH_SIZE: z.coerce.number().int().default(1000), EVENTS_CLICKHOUSE_FLUSH_INTERVAL_MS: z.coerce.number().int().default(1000), METRICS_CLICKHOUSE_BATCH_SIZE: z.coerce.number().int().default(10000), diff --git a/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts b/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts index fb7f384fd27..c563621408c 100644 --- a/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts +++ b/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts @@ -211,6 +211,36 @@ function 
initializeRunEngineClickhouseClient(): ClickHouse { }); } +/** Realtime runs feed tag/batch id resolution (`REALTIME_RUNS_CLICKHOUSE_URL`); + * reuses the default client only when no URL is configured (env defaults it to `CLICKHOUSE_URL`). */ +const defaultRealtimeClickhouseClient = singleton( + "realtimeClickhouseClient", + initializeRealtimeClickhouseClient +); + +function initializeRealtimeClickhouseClient(): ClickHouse { + if (!env.REALTIME_RUNS_CLICKHOUSE_URL) { + return defaultClickhouseClient; + } + + const url = new URL(env.REALTIME_RUNS_CLICKHOUSE_URL); + url.searchParams.delete("secure"); + + return new ClickHouse({ + url: url.toString(), + name: "realtime-runs-clickhouse", + keepAlive: { + enabled: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_ENABLED === "1", + idleSocketTtl: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS, + }, + logLevel: env.REALTIME_RUNS_CLICKHOUSE_LOG_LEVEL, + compression: { + request: env.REALTIME_RUNS_CLICKHOUSE_COMPRESSION_REQUEST === "1", + }, + maxOpenConnections: env.REALTIME_RUNS_CLICKHOUSE_MAX_OPEN_CONNECTIONS, + }); +} + /** Task events (`EVENTS_CLICKHOUSE_URL`); not exported — accessed via factory.
*/ const defaultEventsClickhouseClient = singleton( "eventsClickhouseClient", @@ -257,7 +287,8 @@ export type ClientType = | "logs" | "query" | "admin" - | "engine"; + | "engine" + | "realtime"; function buildOrgClickhouseClient(url: string, clientType: ClientType): ClickHouse { const parsed = new URL(url); @@ -330,6 +361,20 @@ function buildOrgClickhouseClient(url: string, clientType: ClientType): ClickHou }, maxOpenConnections: env.RUN_ENGINE_CLICKHOUSE_MAX_OPEN_CONNECTIONS, }); + case "realtime": + return new ClickHouse({ + url: parsed.toString(), + name, + keepAlive: { + enabled: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_ENABLED === "1", + idleSocketTtl: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS, + }, + logLevel: env.REALTIME_RUNS_CLICKHOUSE_LOG_LEVEL, + compression: { + request: env.REALTIME_RUNS_CLICKHOUSE_COMPRESSION_REQUEST === "1", + }, + maxOpenConnections: env.REALTIME_RUNS_CLICKHOUSE_MAX_OPEN_CONNECTIONS, + }); case "standard": case "query": case "admin": @@ -398,6 +443,8 @@ export class ClickhouseFactory { return defaultAdminClickhouseClient; case "engine": return defaultRunEngineClickhouseClient; + case "realtime": + return defaultRealtimeClickhouseClient; } } diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts index 2888deec863..1b645eb5fb8 100644 --- a/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts +++ b/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts @@ -55,7 +55,7 @@ function initializeNotifierRealtimeClient(): NotifierRealtimeClient { runReader: new RunHydrator({ replica: $replica }), runListResolver: new ClickHouseRunListResolver({ getClickhouse: (organizationId) => - clickhouseFactory.getClickhouseForOrganization(organizationId, "standard"), + clickhouseFactory.getClickhouseForOrganization(organizationId, "realtime"), prisma: $replica, }), notifier: 
getRunChangeNotifier(), diff --git a/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts index 36ce0a4325b..95edc82620d 100644 --- a/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts +++ b/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts @@ -26,7 +26,7 @@ function initializeShadowRealtimeClient(): ShadowRealtimeClient { runReader: new RunHydrator({ replica: $replica }), runListResolver: new ClickHouseRunListResolver({ getClickhouse: (organizationId) => - clickhouseFactory.getClickhouseForOrganization(organizationId, "standard"), + clickhouseFactory.getClickhouseForOrganization(organizationId, "realtime"), prisma: $replica, }), }); From 5c42d55da6548e7961b2b12555a6f6cb9fdba0fb Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 12:10:35 +0100 Subject: [PATCH 05/23] fix(webapp): log realtime run-change pub/sub failures at error level Surface publish, subscribe, and unsubscribe failures in the realtime run-change pub/sub at error level with clearer static messages, instead of debug. 
--- .../realtime/runChangeNotifier.server.ts | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts index ba8748c6cf4..9bc0f69d6e6 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts @@ -78,11 +78,17 @@ export class RunChangeNotifier { const result = publisher.publish(channel, payload); if (typeof (result as Promise)?.catch === "function") { (result as Promise).catch((error) => { - logger.debug("[runChangeNotifier] publish failed", { error, channel }); + logger.error("[runChangeNotifier] Failed to publish run-changed notification", { + error, + channel, + }); }); } } catch (error) { - logger.debug("[runChangeNotifier] publish threw", { error, channel }); + logger.error("[runChangeNotifier] Failed to publish run-changed notification", { + error, + channel, + }); } } @@ -125,7 +131,10 @@ export class RunChangeNotifier { listeners = new Set(); this.#listeners.set(channel, listeners); subscriber.subscribe(channel).catch((error) => { - logger.debug("[runChangeNotifier] subscribe failed", { error, channel }); + logger.error("[runChangeNotifier] Failed to subscribe to run-change channel", { + error, + channel, + }); }); } listeners.add(resolveChanged); @@ -161,7 +170,10 @@ export class RunChangeNotifier { // now unsubscribed in Redis but has live waiters. Re-subscribe so they // still receive messages (the long-poll backstop covers the gap). subscriber.subscribe(channel).catch((error) => { - logger.debug("[runChangeNotifier] resubscribe failed", { error, channel }); + logger.error("[runChangeNotifier] Failed to re-subscribe to run-change channel", { + error, + channel, + }); }); } }) @@ -169,7 +181,10 @@ export class RunChangeNotifier { // UNSUBSCRIBE failed: the channel is likely still subscribed in Redis. 
// Keep the (empty) map entry so a future subscriber reuses it without a // duplicate SUBSCRIBE and #onMessage stays consistent with Redis state. - logger.debug("[runChangeNotifier] unsubscribe failed", { error, channel }); + logger.error("[runChangeNotifier] Failed to unsubscribe from run-change channel", { + error, + channel, + }); }); } }; From 75848cf84d4396ef08aeba24f80a543b54a825d1 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 13:47:43 +0100 Subject: [PATCH 06/23] feat(webapp): give the realtime runs feed its own pub/sub Redis Run the realtime runs feed's run-changed pub/sub on a dedicated REALTIME_RUNS_PUBSUB_REDIS_* connection set (falling back to PUBSUB_REDIS_* / REDIS_*), so its publish/subscribe traffic can be isolated from the shared pub/sub Redis. --- apps/webapp/app/env.server.ts | 31 +++++++++++++++++++ .../runChangeNotifierInstance.server.ts | 12 +++---- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index c0c61912414..4920355f68f 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -357,6 +357,37 @@ const EnvironmentSchema = z PUBSUB_REDIS_TLS_DISABLED: z.string().default(process.env.REDIS_TLS_DISABLED ?? "false"), PUBSUB_REDIS_CLUSTER_MODE_ENABLED: z.string().default("0"), + // Dedicated pub/sub Redis for the realtime runs feed's run-changed notifier, so + // its publish/subscribe traffic can run on its own instance. Each value falls + // back to the shared PUBSUB_REDIS_* (then REDIS_*) when unset, so the default is + // unchanged until explicitly pointed at a dedicated instance. + REALTIME_RUNS_PUBSUB_REDIS_HOST: z + .string() + .optional() + .transform((v) => v ?? process.env.PUBSUB_REDIS_HOST ?? process.env.REDIS_HOST), + REALTIME_RUNS_PUBSUB_REDIS_PORT: z.coerce + .number() + .optional() + .transform((v) => { + if (v !== undefined) return v; + const raw = process.env.PUBSUB_REDIS_PORT ?? 
process.env.REDIS_PORT; + return raw ? parseInt(raw) : undefined; + }), + REALTIME_RUNS_PUBSUB_REDIS_USERNAME: z + .string() + .optional() + .transform((v) => v ?? process.env.PUBSUB_REDIS_USERNAME ?? process.env.REDIS_USERNAME), + REALTIME_RUNS_PUBSUB_REDIS_PASSWORD: z + .string() + .optional() + .transform((v) => v ?? process.env.PUBSUB_REDIS_PASSWORD ?? process.env.REDIS_PASSWORD), + REALTIME_RUNS_PUBSUB_REDIS_TLS_DISABLED: z + .string() + .default(process.env.PUBSUB_REDIS_TLS_DISABLED ?? process.env.REDIS_TLS_DISABLED ?? "false"), + REALTIME_RUNS_PUBSUB_REDIS_CLUSTER_MODE_ENABLED: z + .string() + .default(process.env.PUBSUB_REDIS_CLUSTER_MODE_ENABLED ?? "0"), + DEFAULT_ENV_EXECUTION_CONCURRENCY_LIMIT: z.coerce.number().int().default(100), DEFAULT_ENV_EXECUTION_CONCURRENCY_BURST_FACTOR: z.coerce.number().default(1.0), DEFAULT_ORG_EXECUTION_CONCURRENCY_LIMIT: z.coerce.number().int().default(300), diff --git a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts index 71001192c1a..78f68537c70 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts @@ -21,12 +21,12 @@ const notifierEnabled = env.REALTIME_NOTIFIER_ENABLED === "1"; function initializeRunChangeNotifier(): RunChangeNotifier { const notifier = new RunChangeNotifier({ redis: { - host: env.PUBSUB_REDIS_HOST, - port: env.PUBSUB_REDIS_PORT, - username: env.PUBSUB_REDIS_USERNAME, - password: env.PUBSUB_REDIS_PASSWORD, - tlsDisabled: env.PUBSUB_REDIS_TLS_DISABLED === "true", - clusterMode: env.PUBSUB_REDIS_CLUSTER_MODE_ENABLED === "1", + host: env.REALTIME_RUNS_PUBSUB_REDIS_HOST, + port: env.REALTIME_RUNS_PUBSUB_REDIS_PORT, + username: env.REALTIME_RUNS_PUBSUB_REDIS_USERNAME, + password: env.REALTIME_RUNS_PUBSUB_REDIS_PASSWORD, + tlsDisabled: env.REALTIME_RUNS_PUBSUB_REDIS_TLS_DISABLED === "true", + clusterMode: 
env.REALTIME_RUNS_PUBSUB_REDIS_CLUSTER_MODE_ENABLED === "1", }, }); From 2b3794618d41e691b1f3f7dcfb1bb6bf050d6a7a Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 14:55:51 +0100 Subject: [PATCH 07/23] fix(webapp): adapt the realtime run-id resolver to paginated listRunIds listRunIds now returns a keyset page ({ runIds, pagination }); read runIds from it. The page is already capped to the requested size, so the manual trim is gone. Also make the run-change event-bus handler registration return a truthy value so the singleton() wrapper doesn't re-attach listeners on dev reloads. --- .../services/realtime/clickHouseRunListResolver.server.ts | 7 +++---- .../services/realtime/runChangeNotifierHandlers.server.ts | 7 ++++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts index 16dda7838b7..003646bb74a 100644 --- a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts +++ b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts @@ -27,7 +27,7 @@ export class ClickHouseRunListResolver implements RunListResolver { const clickhouse = await this.options.getClickhouse(filter.organizationId); const repository = new RunsRepository({ clickhouse, prisma: this.options.prisma }); - const ids = await repository.listRunIds({ + const { runIds } = await repository.listRunIds({ organizationId: filter.organizationId, projectId: filter.projectId, environmentId: filter.environmentId, @@ -37,8 +37,7 @@ export class ClickHouseRunListResolver implements RunListResolver { page: { size: filter.limit }, }); - // listRunIds overfetches by one (size + 1) for has-more detection and doesn't - // trim, so enforce the caller's cap here. - return ids.slice(0, filter.limit); + // listRunIds is keyset-paginated; runIds is already capped to page.size (= limit). 
+ return runIds; } } diff --git a/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts index 791991178e4..9ed93e66a4a 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts @@ -16,8 +16,11 @@ import { publishRunChanged } from "./runChangeNotifierInstance.server"; * high-value, env-cheap transitions here. */ export function registerRunChangeNotifierHandlers() { + // Return a truthy value in every path so the singleton() wrapper (which uses + // ??=) caches the result and never re-runs this factory — re-running would + // attach duplicate engine-bus listeners on each Remix dev-mode reload. if (env.REALTIME_NOTIFIER_ENABLED !== "1") { - return; + return true; } // Status transitions (checkpoint suspend/resume, pending version, dequeue) — @@ -70,4 +73,6 @@ export function registerRunChangeNotifierHandlers() { }); logger.info("[runChangeNotifier] realtime run-change notifier handlers registered"); + + return true; } From 57755b8b5c58c99473e797f28d07c07960f40f74 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Mon, 8 Jun 2026 15:48:19 +0100 Subject: [PATCH 08/23] fix(webapp): JSON-encode the run-set cache key to avoid separator collisions A tag containing a comma keyed the same as two separate tags, so the resolve+hydrate coalescing cache could serve the wrong runs for up to its TTL. Encode the tag/column arrays instead of joining them. 
--- .../app/services/realtime/notifierRealtimeClient.server.ts | 7 +++++-- apps/webapp/test/realtime/notifierRunSetCache.test.ts | 7 +++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts index 38874b2de4b..9c49e62e4c4 100644 --- a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts +++ b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts @@ -529,8 +529,11 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { /** Stable cache key for the resolve+hydrate cache. Same key => same id-set and the * same projected columns, so cached rows always match the requesting feed. */ #runSetCacheKey(environmentId: string, filter: RunSetFilter, skipColumns: string[]): string { - const tags = filter.tags && filter.tags.length > 0 ? [...filter.tags].sort().join(",") : ""; - const cols = skipColumns.length > 0 ? [...skipColumns].sort().join(",") : ""; + // JSON-encode the arrays (not a join) so a value containing the separators — + // e.g. a tag with a comma — can't collide: ["a,b"] must not key the same as + // ["a","b"], which are different ClickHouse filters. + const tags = filter.tags && filter.tags.length > 0 ? JSON.stringify([...filter.tags].sort()) : ""; + const cols = skipColumns.length > 0 ? JSON.stringify([...skipColumns].sort()) : ""; const maxListResults = this.options.maxListResults ?? DEFAULT_MAX_LIST_RESULTS; return `${environmentId}|${tags}|${filter.batchId ?? ""}|${ filter.createdAtAfter?.getTime() ?? 
"" diff --git a/apps/webapp/test/realtime/notifierRunSetCache.test.ts b/apps/webapp/test/realtime/notifierRunSetCache.test.ts index 90e3446e792..2f325296f1c 100644 --- a/apps/webapp/test/realtime/notifierRunSetCache.test.ts +++ b/apps/webapp/test/realtime/notifierRunSetCache.test.ts @@ -157,6 +157,13 @@ describe("NotifierRealtimeClient tag-list createdAt bucketing", () => { expect(resolveSpy).toHaveBeenCalledTimes(2); }); + it("does not collide a comma-containing tag with two separate tags", async () => { + const { client, resolveSpy } = makeClient({ runSetCreatedAtBucketMs: 60 * 60_000 }); + await snapshotTag(client, ["a,b"]); // one tag "a,b" + await snapshotTag(client, ["a", "b"]); // two tags a OR b — a different filter + expect(resolveSpy).toHaveBeenCalledTimes(2); + }); + it("keeps each feed's exact lower bound when bucketing is disabled (0)", async () => { vi.useFakeTimers({ toFake: ["Date"] }); vi.setSystemTime(new Date("2026-06-07T10:00:30.500Z")); From fbbd7831d2333b70c5dd75920aa7ae4d2a4f0c74 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 9 Jun 2026 09:11:25 +0100 Subject: [PATCH 09/23] perf(webapp): scale the realtime runs feed under high concurrency Keep a single busy environment from overwhelming the realtime runs feed: - Coalesce per-environment wake notifications to a bounded rate, so a high-throughput environment wakes its subscribers at a steady cap instead of once per run change. - Hold a multi-run live poll open on an empty result instead of returning an immediate up-to-date, cutting wasted round-trips when a wake does not match a subscriber's filter. - Support Redis Cluster sharded pub/sub (SSUBSCRIBE/SPUBLISH) so the feed's pub/sub scales horizontally across shards by environment/run. All behind the existing feature flag and tunable env vars. Bumps ioredis to 5.6.x across the workspace (required for cluster sharded pub/sub). 
--- apps/supervisor/package.json | 2 +- apps/webapp/app/env.server.ts | 13 ++ .../realtime/notifierRealtimeClient.server.ts | 153 ++++++++++++------ .../notifierRealtimeClientInstance.server.ts | 1 + .../realtime/runChangeNotifier.server.ts | 114 +++++++++++-- .../runChangeNotifierInstance.server.ts | 11 +- apps/webapp/package.json | 2 +- .../test/realtime/notifierHoldOnEmpty.test.ts | 136 ++++++++++++++++ .../test/realtime/runChangeNotifier.test.ts | 105 ++++++++++++ internal-packages/redis/package.json | 2 +- internal-packages/testcontainers/package.json | 2 +- pnpm-lock.yaml | 26 +-- 12 files changed, 490 insertions(+), 77 deletions(-) create mode 100644 apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts diff --git a/apps/supervisor/package.json b/apps/supervisor/package.json index 7a3537dbc04..2725fe2b729 100644 --- a/apps/supervisor/package.json +++ b/apps/supervisor/package.json @@ -18,7 +18,7 @@ "@kubernetes/client-node": "^1.0.0", "@trigger.dev/core": "workspace:*", "dockerode": "^4.0.6", - "ioredis": "^5.3.2", + "ioredis": "~5.6.0", "p-limit": "^6.2.0", "prom-client": "^15.1.0", "socket.io": "4.7.4", diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 4920355f68f..265dc3497f0 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -324,6 +324,15 @@ const EnvironmentSchema = z // hydrate cache entry. Floored, so the window only ever widens by < bucket. 0 // disables bucketing (each feed keeps its exact lower bound). REALTIME_NOTIFIER_RUNSET_CREATED_AT_BUCKET_MS: z.coerce.number().int().default(60_000), + // Leading-edge throttle (ms) on the per-env wake channel: a busy env's run-change + // firehose is collapsed to at most one feed-wake per window, decoupling wake load + // from run throughput. Lossless because consumers refetch current state on a wake. + // 0 disables coalescing (every change wakes immediately). 
+ REALTIME_NOTIFIER_ENV_WAKE_COALESCE_WINDOW_MS: z.coerce.number().int().default(100), + // When "1", a multi-run live poll woken by a change irrelevant to its filter keeps + // holding the long-poll (re-resolving cheaply) instead of returning an empty + // up-to-date the client would immediately re-issue. "0" reverts to per-wake replies. + REALTIME_NOTIFIER_HOLD_ON_EMPTY: z.string().default("1"), PUBSUB_REDIS_HOST: z .string() @@ -387,6 +396,10 @@ const EnvironmentSchema = z REALTIME_RUNS_PUBSUB_REDIS_CLUSTER_MODE_ENABLED: z .string() .default(process.env.PUBSUB_REDIS_CLUSTER_MODE_ENABLED ?? "0"), + // Use sharded pub/sub (SSUBSCRIBE/SPUBLISH) when in cluster mode, so a busy env's + // traffic stays on one shard instead of broadcasting to every node. Only takes + // effect alongside CLUSTER_MODE_ENABLED. "0" forces classic pub/sub on the cluster. + REALTIME_RUNS_PUBSUB_REDIS_SHARDED_ENABLED: z.string().default("1"), DEFAULT_ENV_EXECUTION_CONCURRENCY_LIMIT: z.coerce.number().int().default(100), DEFAULT_ENV_EXECUTION_CONCURRENCY_BURST_FACTOR: z.coerce.number().default(1.0), diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts index 9c49e62e4c4..b0f6aade879 100644 --- a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts +++ b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts @@ -91,6 +91,11 @@ export type NotifierRealtimeClientOptions = { * same-tag feeds pinned within the same bucket share a cache entry. Defaults to * 60000. 0 disables bucketing. */ runSetCreatedAtBucketMs?: number; + /** When true (default), a multi-run live poll woken by a change irrelevant to its + * filter keeps holding the long-poll (re-resolving cheaply on each wake) instead of + * returning an empty up-to-date the client would immediately re-issue. The empty + * response is the dominant cost under a busy per-env wake channel. 
*/ + holdOnEmpty?: boolean; /** Observability hook: why a live request woke (notify vs timeout vs abort). */ onWakeup?: (reason: WakeupReason) => void; /** Observability hook: whether a multi-run resolve hit the cache, coalesced onto @@ -417,55 +422,93 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { signal: AbortSignal | undefined ): Promise { return this.#withConcurrencySlot(environment, async () => { - // One env-scoped subscription per feed (not one per run): any run change in - // the env wakes us, then we re-resolve the filter. - const reason = await this.#waitForEnvChange(environment.id, signal); - this.options.onWakeup?.(reason); - - const cached = this.#workingSetCache.get(handle); const offsetFloorMs = parseOffsetUpdatedAtMs(offset); - const seq = this.#nextSeq(); + // Total time to hold this long-poll, jittered to avoid synchronized refetch herds. + const deadline = Date.now() + this.#jitteredTimeout(); + const holdOnEmpty = this.options.holdOnEmpty ?? true; + + // Working set we diff against: seeded from the cache (or the offset floor on a + // miss) and advanced on each refetch within this held request. + let prevSeen = this.#workingSetCache.get(handle); + + // The per-env channel wakes this feed on ANY run change in the environment, but + // most changes don't match this feed's filter. Rather than return an empty + // up-to-date the client would immediately re-issue (the dominant cost under a + // busy env), we hold the connection and only respond when THIS feed has a real + // delta or the backstop elapses. Each wake re-resolves via the coalesced + + // short-TTL cache, so an env-wide wake never fans out into per-feed CH+PG queries. + while (true) { + const remaining = deadline - Date.now(); + // One env-scoped subscription per wait (not one per run); re-subscribed each + // loop until a relevant delta or the budget runs out. + const reason = + remaining > 0 ? 
await this.#waitForEnvChange(environment.id, signal, remaining) : "timeout"; + this.options.onWakeup?.(reason); + + if (reason === "abort") { + // Client disconnected; the response is discarded. Skip the refetch. + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset, + handle, + cursor: String(this.#nextSeq()), + }); + } - // ClickHouse resolves the (possibly stale) membership; Postgres hydrates the - // authoritative current rows, so status is always fresh even if CH lags. The - // resolve+hydrate is coalesced + short-TTL cached so a single env-wide wake - // doesn't fan out into one CH+PG query per concurrent same-filter feed. - const rows = await this.#resolveAndHydrate(environment, filter, skipColumns); - - // Diff against what the client already has, using the hydrated updatedAt: - // cache hit => per-row (new = insert, advanced = update); miss => anything - // newer than the offset floor as a merge-safe update. - const changes: RowChange[] = []; - const seen: WorkingSet = new Map(); - let maxUpdatedAt = offsetFloorMs; - for (const row of rows) { - const updatedAtMs = row.updatedAt.getTime(); - seen.set(row.id, updatedAtMs); - maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs); - - if (cached) { - const prior = cached.get(row.id); - if (prior === undefined) { - changes.push({ row, operation: "insert" }); - } else if (updatedAtMs > prior) { + // ClickHouse resolves the (possibly stale) membership; Postgres hydrates the + // authoritative current rows, so status is always fresh even if CH lags. We + // refetch on every wake AND on the final timeout, so a wake missed during the + // brief re-subscribe gap is still caught by the backstop. 
+ const rows = await this.#resolveAndHydrate(environment, filter, skipColumns); + + // Diff against what the client already has, using the hydrated updatedAt: + // prior working set => per-row (new = insert, advanced = update); miss => + // anything newer than the offset floor as a merge-safe update. + const changes: RowChange[] = []; + const seen: WorkingSet = new Map(); + let maxUpdatedAt = offsetFloorMs; + for (const row of rows) { + const updatedAtMs = row.updatedAt.getTime(); + seen.set(row.id, updatedAtMs); + maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs); + + if (prevSeen) { + const prior = prevSeen.get(row.id); + if (prior === undefined) { + changes.push({ row, operation: "insert" }); + } else if (updatedAtMs > prior) { + changes.push({ row, operation: "update" }); + } + } else if (updatedAtMs > offsetFloorMs) { changes.push({ row, operation: "update" }); } - } else if (updatedAtMs > offsetFloorMs) { - changes.push({ row, operation: "update" }); } - } - - // Refresh the working set so runs that left the filter stop being tracked - // (the client keeps showing them; the SDK never applies deletes). - this.#workingSetCache.set(handle, seen); - const body = changes.length === 0 ? buildUpToDateBody() : buildRowsBody(changes, skipColumns); + // Refresh the working set so runs that left the filter stop being tracked + // (the client keeps showing them; the SDK never applies deletes). + this.#workingSetCache.set(handle, seen); + prevSeen = seen; + + if (changes.length > 0) { + const seq = this.#nextSeq(); + return this.#buildResponse(buildRowsBody(changes, skipColumns), apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, seq), + handle, + cursor: String(seq), + }); + } - return this.#buildResponse(body, apiVersion, clientVersion, { - offset: encodeOffset(maxUpdatedAt, seq), - handle, - cursor: String(seq), - }); + // Empty diff. 
With hold-on-empty (default) keep waiting until a real delta or + // the budget elapses; otherwise fall back to the legacy per-wake up-to-date. + if (reason === "timeout" || !holdOnEmpty) { + const seq = this.#nextSeq(); + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, seq), + handle, + cursor: String(seq), + }); + } + // reason === "notify" with an empty diff: keep holding (loop, re-subscribe). + } }); } @@ -654,21 +697,33 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { } } - #waitForChange(runId: string, signal?: AbortSignal): Promise { - return this.#waitForSubscription(this.options.notifier.subscribeToRunChanges(runId), signal); + #waitForChange(runId: string, signal?: AbortSignal, timeoutMs?: number): Promise { + return this.#waitForSubscription( + this.options.notifier.subscribeToRunChanges(runId), + signal, + timeoutMs + ); } - #waitForEnvChange(environmentId: string, signal?: AbortSignal): Promise { + #waitForEnvChange( + environmentId: string, + signal?: AbortSignal, + timeoutMs?: number + ): Promise { return this.#waitForSubscription( this.options.notifier.subscribeToEnvChanges(environmentId), - signal + signal, + timeoutMs ); } - /** Race a notifier subscription against the backstop timeout and the abort signal. */ + /** Race a notifier subscription against a timeout (the jittered backstop by default, + * or an explicit remaining budget when a live request holds across wakes) and the + * abort signal. 
*/ async #waitForSubscription( subscription: RunChangeSubscription, - signal?: AbortSignal + signal?: AbortSignal, + timeoutMs?: number ): Promise { if (signal?.aborted) { subscription.unsubscribe(); @@ -682,7 +737,7 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { return await new Promise((resolve) => { subscription.changed.then(() => resolve("notify")).catch(() => resolve("timeout")); - timer = setTimeout(() => resolve("timeout"), this.#jitteredTimeout()); + timer = setTimeout(() => resolve("timeout"), timeoutMs ?? this.#jitteredTimeout()); if (signal) { onAbort = () => resolve("abort"); diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts index 1b645eb5fb8..7b486b5a1b8 100644 --- a/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts +++ b/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts @@ -77,6 +77,7 @@ function initializeNotifierRealtimeClient(): NotifierRealtimeClient { runSetResolveCacheMaxEntries: env.REALTIME_NOTIFIER_RUNSET_CACHE_MAX_ENTRIES, listCacheMaxEntries: env.REALTIME_NOTIFIER_WORKING_SET_MAX_ENTRIES, runSetCreatedAtBucketMs: env.REALTIME_NOTIFIER_RUNSET_CREATED_AT_BUCKET_MS, + holdOnEmpty: env.REALTIME_NOTIFIER_HOLD_ON_EMPTY === "1", onWakeup: (reason) => wakeups.inc({ reason }), onRunSetResolve: (result) => runSetResolves.inc({ result }), onRunSetQuery: (stage, ms) => runSetQueryMs.observe({ stage }, ms), diff --git a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts index 9bc0f69d6e6..45b65732ed8 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts @@ -18,6 +18,22 @@ export type RunChangeNotifierOptions = { /** Channel name prefix; the runId is appended inside a hash-tag for slot locality. 
*/ channelPrefix?: string; connectionName?: string; + /** + * Leading-edge throttle (ms) for the high-volume per-env channel: deliver the + * first wake immediately, then at most one more per window while changes keep + * arriving. Bounds the feed-wake rate per env regardless of run throughput. + * Defaults to 100ms. 0 disables coalescing (wake on every message). + */ + envWakeCoalesceWindowMs?: number; + /** + * Use Redis sharded pub/sub (SSUBSCRIBE/SPUBLISH) instead of classic pub/sub. + * Only valid against a Redis Cluster (the channels are hash-tagged by run/env id, + * so each lands on one shard) and requires the client to be built with + * `clusterOptions.shardedSubscribers: true`. Classic PUBLISH in a cluster + * broadcasts to every node, so sharded pub/sub is what actually distributes the + * load. Defaults to false (classic pub/sub, for single-node / local). + */ + shardedPubSub?: boolean; }; export type RunChangeSubscription = { @@ -27,6 +43,7 @@ export type RunChangeSubscription = { }; const DEFAULT_CHANNEL_PREFIX = "realtime:"; +const DEFAULT_ENV_WAKE_COALESCE_WINDOW_MS = 100; /** * RunChangeNotifier — the single, encapsulated module that carries "run X changed" @@ -44,8 +61,10 @@ const DEFAULT_CHANNEL_PREFIX = "realtime:"; * - `publish` is fire-and-forget and never throws; a dropped publish only costs * latency because the consumer has a timeout backstop. * - * Channels are hash-tagged (`{}`) so a later move to sharded - * pub/sub (SPUBLISH/SSUBSCRIBE) keeps slot locality without a channel rename. + * Channels are hash-tagged (`{}` / `env:{}`) so they + * land on a single cluster slot. With `shardedPubSub` (cluster only) the feed uses + * SSUBSCRIBE/SPUBLISH so each run/env's traffic stays on one shard rather than + * broadcasting cluster-wide; classic pub/sub is used single-node. 
*/ export class RunChangeNotifier { #publisher: RedisClient | undefined; @@ -53,10 +72,19 @@ export class RunChangeNotifier { readonly #listeners = new Map void>>(); readonly #channelPrefix: string; readonly #connectionName: string; + readonly #coalesceWindowMs: number; + /** When true, use sharded pub/sub (SSUBSCRIBE/SPUBLISH/smessage) — see options. */ + readonly #sharded: boolean; + /** Active coalescing windows per channel (env channels only). */ + readonly #coalesceTimers = new Map>(); + /** Channels that received a message while their window was open (need a trailing wake). */ + readonly #coalesceDirty = new Set(); constructor(private readonly options: RunChangeNotifierOptions) { this.#channelPrefix = options.channelPrefix ?? DEFAULT_CHANNEL_PREFIX; this.#connectionName = options.connectionName ?? "trigger:realtime:run-change-notifier"; + this.#coalesceWindowMs = options.envWakeCoalesceWindowMs ?? DEFAULT_ENV_WAKE_COALESCE_WINDOW_MS; + this.#sharded = options.shardedPubSub ?? false; } /** @@ -75,7 +103,11 @@ export class RunChangeNotifier { #publishToChannel(channel: string, payload: string): void { try { const publisher = this.#ensurePublisher(); - const result = publisher.publish(channel, payload); + // Sharded pub/sub (SPUBLISH) routes to the channel's slot owner; classic + // PUBLISH broadcasts cluster-wide. The channel is hash-tagged by run/env id. + const result = this.#sharded + ? 
publisher.spublish(channel, payload) + : publisher.publish(channel, payload); if (typeof (result as Promise)?.catch === "function") { (result as Promise).catch((error) => { logger.error("[runChangeNotifier] Failed to publish run-changed notification", { @@ -130,7 +162,7 @@ export class RunChangeNotifier { if (!listeners) { listeners = new Set(); this.#listeners.set(channel, listeners); - subscriber.subscribe(channel).catch((error) => { + this.#subscribeChannel(subscriber, channel).catch((error) => { logger.error("[runChangeNotifier] Failed to subscribe to run-change channel", { error, channel, @@ -156,8 +188,7 @@ export class RunChangeNotifier { // only if no new listener re-subscribed while it was in flight. The map // entry's existence mirrors "subscribed (or subscribe in flight) in Redis", // so the subscribe path safely reuses it without a duplicate SUBSCRIBE. - subscriber - .unsubscribe(channel) + this.#unsubscribeChannel(subscriber, channel) .then(() => { const latest = this.#listeners.get(channel); if (!latest) { @@ -169,7 +200,7 @@ export class RunChangeNotifier { // A listener arrived during the in-flight UNSUBSCRIBE; the channel is // now unsubscribed in Redis but has live waiters. Re-subscribe so they // still receive messages (the long-poll backstop covers the gap). 
- subscriber.subscribe(channel).catch((error) => { + this.#subscribeChannel(subscriber, channel).catch((error) => { logger.error("[runChangeNotifier] Failed to re-subscribe to run-change channel", { error, channel, @@ -198,6 +229,11 @@ export class RunChangeNotifier { } async quit(): Promise { + for (const timer of this.#coalesceTimers.values()) { + clearTimeout(timer); + } + this.#coalesceTimers.clear(); + this.#coalesceDirty.clear(); await Promise.allSettled([this.#subscriber?.quit(), this.#publisher?.quit()]); this.#subscriber = undefined; this.#publisher = undefined; @@ -214,13 +250,38 @@ export class RunChangeNotifier { #ensureSubscriber(): RedisClient { if (!this.#subscriber) { const subscriber = createRedisClient(`${this.#connectionName}:sub`, this.options.redis); - subscriber.on("message", (channel: string) => this.#onMessage(channel)); + const onMessage = (channel: string) => this.#onMessage(channel); + // Classic pub/sub delivers "message"; sharded pub/sub delivers "smessage". + // Register both so the delivery path is identical regardless of mode. + subscriber.on("message", onMessage); + subscriber.on("smessage", onMessage); this.#subscriber = subscriber; } return this.#subscriber; } + /** SUBSCRIBE (classic) vs SSUBSCRIBE (sharded, cluster-only). */ + #subscribeChannel(subscriber: RedisClient, channel: string): Promise { + return this.#sharded ? subscriber.ssubscribe(channel) : subscriber.subscribe(channel); + } + + /** UNSUBSCRIBE (classic) vs SUNSUBSCRIBE (sharded, cluster-only). */ + #unsubscribeChannel(subscriber: RedisClient, channel: string): Promise { + return this.#sharded ? subscriber.sunsubscribe(channel) : subscriber.unsubscribe(channel); + } + #onMessage(channel: string) { + // The per-env channel carries a busy environment's entire run-change firehose to + // every tag/batch feed, so throttle it; the per-run channel is low-volume and + // latency-sensitive, so deliver it immediately. 
+ if (this.#coalesceWindowMs > 0 && this.#isEnvChannel(channel)) { + this.#deliverCoalesced(channel); + return; + } + this.#deliver(channel); + } + + #deliver(channel: string) { const listeners = this.#listeners.get(channel); if (!listeners) { return; @@ -231,8 +292,41 @@ export class RunChangeNotifier { } } - // Channels are hash-tagged (`...{}`) so a later move to sharded pub/sub - // keeps slot locality without a rename. + /** + * Leading-edge throttle: deliver the first wake immediately, then suppress further + * wakes for the window, delivering one trailing wake if any messages arrived during + * it (and re-opening while activity continues). Caps the feed-wake rate per env to + * ~1/window no matter how fast runs change. Lossless: consumers refetch current + * state on a wake, so a coalesced burst is captured by the next refetch. + */ + #deliverCoalesced(channel: string) { + if (this.#coalesceTimers.has(channel)) { + this.#coalesceDirty.add(channel); + return; + } + this.#deliver(channel); + this.#openCoalesceWindow(channel); + } + + #openCoalesceWindow(channel: string) { + const timer = setTimeout(() => { + this.#coalesceTimers.delete(channel); + if (this.#coalesceDirty.delete(channel)) { + this.#deliver(channel); + this.#openCoalesceWindow(channel); + } + }, this.#coalesceWindowMs); + // Don't let a pending coalescing window hold the process open at shutdown. + timer.unref?.(); + this.#coalesceTimers.set(channel, timer); + } + + #isEnvChannel(channel: string): boolean { + return channel.startsWith(`${this.#channelPrefix}env:`); + } + + // Channels are hash-tagged (`...{}`) so all of a run's/env's traffic maps to + // one cluster slot (one shard) under sharded pub/sub. 
#channelForRun(runId: string): string { return `${this.#channelPrefix}run:{${runId}}`; } diff --git a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts index 78f68537c70..c49f7706042 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts @@ -19,6 +19,11 @@ import { const notifierEnabled = env.REALTIME_NOTIFIER_ENABLED === "1"; function initializeRunChangeNotifier(): RunChangeNotifier { + const clusterMode = env.REALTIME_RUNS_PUBSUB_REDIS_CLUSTER_MODE_ENABLED === "1"; + // Sharded pub/sub only works against a cluster; classic pub/sub there would + // broadcast every message to every node, so this is what actually shards load. + const shardedPubSub = clusterMode && env.REALTIME_RUNS_PUBSUB_REDIS_SHARDED_ENABLED === "1"; + const notifier = new RunChangeNotifier({ redis: { host: env.REALTIME_RUNS_PUBSUB_REDIS_HOST, @@ -26,8 +31,12 @@ function initializeRunChangeNotifier(): RunChangeNotifier { username: env.REALTIME_RUNS_PUBSUB_REDIS_USERNAME, password: env.REALTIME_RUNS_PUBSUB_REDIS_PASSWORD, tlsDisabled: env.REALTIME_RUNS_PUBSUB_REDIS_TLS_DISABLED === "true", - clusterMode: env.REALTIME_RUNS_PUBSUB_REDIS_CLUSTER_MODE_ENABLED === "1", + clusterMode, + // One subscriber connection per shard so SSUBSCRIBE routes to the slot owner. + ...(shardedPubSub ? 
{ clusterOptions: { shardedSubscribers: true } } : {}), }, + envWakeCoalesceWindowMs: env.REALTIME_NOTIFIER_ENV_WAKE_COALESCE_WINDOW_MS, + shardedPubSub, }); new Gauge({ diff --git a/apps/webapp/package.json b/apps/webapp/package.json index 162a9ede9a0..efebaf48207 100644 --- a/apps/webapp/package.json +++ b/apps/webapp/package.json @@ -163,7 +163,7 @@ "humanize-duration": "^3.27.3", "input-otp": "^1.4.2", "intl-parse-accept-language": "^1.0.0", - "ioredis": "^5.3.2", + "ioredis": "~5.6.0", "isbot": "^3.6.5", "jose": "^5.4.0", "json-stable-stringify": "^1.3.0", diff --git a/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts b/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts new file mode 100644 index 00000000000..d429cb3f8de --- /dev/null +++ b/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts @@ -0,0 +1,136 @@ +import { CURRENT_API_VERSION } from "~/api/versions"; +import { + NotifierRealtimeClient, + type RealtimeListEnvironment, +} from "~/services/realtime/notifierRealtimeClient.server"; +import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; +import { describe, expect, it, vi } from "vitest"; + +const ENV: RealtimeListEnvironment = { id: "env_1", organizationId: "org_1", projectId: "proj_1" }; + +// Fixed offset floor so a row's updatedAt being above/below it deterministically +// produces a delta / empty diff. +const FLOOR_MS = Date.UTC(2026, 5, 7, 12, 0, 0); + +function row(id: string, updatedAtMs: number): RealtimeRunRow { + return { + id, + createdAt: new Date("2026-06-07T09:00:00.000Z"), + updatedAt: new Date(updatedAtMs), + } as unknown as RealtimeRunRow; +} + +/** A notifier whose env wakes are driven manually via wake(). Each live-poll loop + * iteration subscribes once (one-shot), so wake() releases exactly one iteration. 
*/ +function controllableNotifier() { + const pending: Array<() => void> = []; + return { + subscribeToRunChanges: () => ({ changed: new Promise(() => {}), unsubscribe() {} }), + subscribeToEnvChanges: () => { + let resolve!: () => void; + const changed = new Promise((r) => { + resolve = r; + }); + pending.push(resolve); + return { changed, unsubscribe() {} }; + }, + wake() { + pending.shift()?.(); + }, + pending() { + return pending.length; + }, + }; +} + +function makeClient(notifier: unknown, overrides: Record = {}) { + let rowsToReturn: RealtimeRunRow[] = []; + const hydrateSpy = vi.fn(async () => rowsToReturn); + + const client = new NotifierRealtimeClient({ + runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, + runListResolver: { resolveMatchingRunIds: async () => ["run_1"] } as any, + notifier: notifier as any, + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, + // Disable the resolve cache so each held iteration re-hydrates the latest rows. 
+ runSetResolveCacheTtlMs: 0, + livePollTimeoutMs: 10_000, + ...overrides, + }); + + return { client, hydrateSpy, setRows: (rows: RealtimeRunRow[]) => (rowsToReturn = rows) }; +} + +function liveRuns(client: NotifierRealtimeClient) { + return client.streamRuns( + `http://localhost:3030/realtime/v1/runs?offset=${FLOOR_MS}_1&live=true&handle=runs_${FLOOR_MS}_7`, + ENV, + { tags: ["t"] }, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); +} + +async function bodyOf(res: Response) { + return JSON.parse(await res.text()) as Array<{ headers?: { control?: string; operation?: string }; value?: unknown }>; +} +const hasRowOp = (body: Awaited>) => + body.some((m) => m?.headers?.operation || (m && typeof m === "object" && "value" in m)); +const isUpToDate = (body: Awaited>) => + body.some((m) => m?.headers?.control === "up-to-date"); + +describe("NotifierRealtimeClient lever A (hold-on-empty)", () => { + it("holds the long-poll on an empty diff and only responds when a real delta arrives", async () => { + const notifier = controllableNotifier(); + const { client, hydrateSpy, setRows } = makeClient(notifier); + setRows([row("run_1", FLOOR_MS - 1_000)]); // older than the floor -> empty diff + + const responsePromise = liveRuns(client); + let settled = false; + void responsePromise.then(() => (settled = true)); + + // Feed subscribed and is waiting. + await vi.waitFor(() => expect(notifier.pending()).toBe(1)); + + // An irrelevant change wakes the env channel, but this feed's diff is empty. + notifier.wake(); + // It must re-subscribe and keep holding (no response yet), having refetched once. + await vi.waitFor(() => expect(notifier.pending()).toBe(1)); + expect(settled).toBe(false); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + + // A relevant change: a row advances past the floor. 
+ setRows([row("run_1", FLOOR_MS + 5_000)]); + notifier.wake(); + + const res = await responsePromise; + expect(settled).toBe(true); + expect(res.status).toBe(200); + expect(hasRowOp(await bodyOf(res))).toBe(true); + }); + + it("returns up-to-date once the backstop elapses with no relevant change", async () => { + const notifier = controllableNotifier(); + const { client } = makeClient(notifier, { livePollTimeoutMs: 50 }); + // No rows ever match; never wake -> the backstop fires and we return up-to-date. + const res = await liveRuns(client); + expect(res.status).toBe(200); + expect(isUpToDate(await bodyOf(res))).toBe(true); + }); + + it("with holdOnEmpty=false, returns up-to-date on the first empty wake (legacy behavior)", async () => { + const notifier = controllableNotifier(); + const { client } = makeClient(notifier, { holdOnEmpty: false }); + + const responsePromise = liveRuns(client); + await vi.waitFor(() => expect(notifier.pending()).toBe(1)); + notifier.wake(); // empty diff -> legacy path returns immediately + + const res = await responsePromise; + expect(res.status).toBe(200); + expect(isUpToDate(await bodyOf(res))).toBe(true); + }); +}); diff --git a/apps/webapp/test/realtime/runChangeNotifier.test.ts b/apps/webapp/test/realtime/runChangeNotifier.test.ts index 7459c9f5df5..b6c43e05544 100644 --- a/apps/webapp/test/realtime/runChangeNotifier.test.ts +++ b/apps/webapp/test/realtime/runChangeNotifier.test.ts @@ -208,4 +208,109 @@ describe("RunChangeNotifier", () => { } } ); + + redisTest( + "coalesces a burst of env publishes into far fewer wakes than publishes", + { timeout: 30_000 }, + async ({ redisOptions }) => { + // A busy env's run-change firehose must not wake feeds once per publication. + const notifier = new RunChangeNotifier({ + redis: toRedisOptions(redisOptions), + envWakeCoalesceWindowMs: 100, + }); + try { + // Count wakes by continuously re-subscribing (each subscription is one-shot). 
+ let wakes = 0; + let running = true; + const counter = (async () => { + while (running) { + const sub = notifier.subscribeToEnvChanges("env_burst"); + let woke = false; + void sub.changed.then(() => (woke = true)).catch(() => {}); + const start = Date.now(); + while (!woke && running && Date.now() - start < 1_500) { + await sleep(5); + } + sub.unsubscribe(); + if (woke) wakes++; + else break; + } + })(); + + await sleep(SUBSCRIBE_SETTLE_MS); + // Publish ~200/s for a second to the same env channel. + let pubs = 0; + const end = Date.now() + 1_000; + while (Date.now() < end) { + notifier.publish({ runId: `r${pubs++}`, environmentId: "env_burst" }); + await sleep(5); + } + running = false; + await counter; + + expect(pubs).toBeGreaterThan(100); + expect(wakes).toBeGreaterThanOrEqual(1); // leading edge still delivers + // Leading-edge throttle caps wakes to ~time/window, well below the publish count. + expect(wakes).toBeLessThan(pubs / 4); + } finally { + await notifier.quit(); + } + } + ); + + // Sharded pub/sub (SSUBSCRIBE/SPUBLISH/smessage) wiring — validated end to end on a + // single node (Redis 7.2 accepts these commands and delivers same-node). Multi-shard + // ROUTING needs a real cluster (covered by the cluster fixture), but this proves the + // notifier's sharded command + event path is correct. 
+ redisTest( + "delivers via sharded pub/sub on the per-run channel", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ + redis: toRedisOptions(redisOptions), + shardedPubSub: true, + }); + try { + const subscription = notifier.subscribeToRunChanges("run_sharded"); + let resolved = false; + void subscription.changed.then(() => { + resolved = true; + }); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_sharded" }); + + await vi.waitFor(() => expect(resolved).toBe(true), { timeout: 5_000, interval: 50 }); + subscription.unsubscribe(); + } finally { + await notifier.quit(); + } + } + ); + + redisTest( + "delivers via sharded pub/sub on the per-env channel", + { timeout: 30_000 }, + async ({ redisOptions }) => { + const notifier = new RunChangeNotifier({ + redis: toRedisOptions(redisOptions), + shardedPubSub: true, + }); + try { + const envSub = notifier.subscribeToEnvChanges("env_sharded"); + let envWoke = false; + void envSub.changed.then(() => { + envWoke = true; + }); + + await sleep(SUBSCRIBE_SETTLE_MS); + notifier.publish({ runId: "run_1", environmentId: "env_sharded" }); + + await vi.waitFor(() => expect(envWoke).toBe(true), { timeout: 5_000, interval: 50 }); + envSub.unsubscribe(); + } finally { + await notifier.quit(); + } + } + ); }); diff --git a/internal-packages/redis/package.json b/internal-packages/redis/package.json index 9c13bbf21b0..6c7d8aa2608 100644 --- a/internal-packages/redis/package.json +++ b/internal-packages/redis/package.json @@ -6,7 +6,7 @@ "types": "./src/index.ts", "type": "module", "dependencies": { - "ioredis": "^5.3.2", + "ioredis": "~5.6.0", "@trigger.dev/core": "workspace:*" }, "scripts": { diff --git a/internal-packages/testcontainers/package.json b/internal-packages/testcontainers/package.json index 4ea83344c34..b3ab7ce5dc4 100644 --- a/internal-packages/testcontainers/package.json +++ b/internal-packages/testcontainers/package.json @@ -16,7 +16,7 @@ 
"@clickhouse/client": "^1.11.1", "@opentelemetry/api": "^1.9.1", "@trigger.dev/database": "workspace:*", - "ioredis": "^5.3.2" + "ioredis": "~5.6.0" }, "devDependencies": { "@testcontainers/postgresql": "^11.14.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 782b62cf7ff..39273b2976c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -228,8 +228,8 @@ importers: specifier: ^4.0.6 version: 4.0.6 ioredis: - specifier: ^5.3.2 - version: 5.3.2 + specifier: ~5.6.0 + version: 5.6.1 p-limit: specifier: ^6.2.0 version: 6.2.0 @@ -664,8 +664,8 @@ importers: specifier: ^1.0.0 version: 1.0.0 ioredis: - specifier: ^5.3.2 - version: 5.3.2 + specifier: ~5.6.0 + version: 5.6.1 isbot: specifier: ^3.6.5 version: 3.6.5 @@ -1256,8 +1256,8 @@ importers: specifier: workspace:* version: link:../../packages/core ioredis: - specifier: ^5.3.2 - version: 5.3.2 + specifier: ~5.6.0 + version: 5.6.1 internal-packages/replication: dependencies: @@ -1404,8 +1404,8 @@ importers: specifier: workspace:* version: link:../database ioredis: - specifier: ^5.3.2 - version: 5.3.2 + specifier: ~5.6.0 + version: 5.6.1 devDependencies: '@testcontainers/postgresql': specifier: ^11.14.0 @@ -11970,8 +11970,8 @@ packages: resolution: {integrity: sha512-YFMSV91JNBOSjw1cOfw2tup6hDP7mkz+2AUV7W1L1AM6ntgI75qC1ZeFpjPGMrWp+upmBRTX2fJWQ8c7jsUWpA==} engines: {node: '>=14'} - ioredis@5.3.2: - resolution: {integrity: sha512-1DKMMzlIHM02eBBVOFQ1+AolGjs6+xEcM4PDL7NqOS6szq7H9jSaEkIUH6/a5Hl241LzW6JLSiAbNvTQjUupUA==} + ioredis@5.6.1: + resolution: {integrity: sha512-UxC0Yv1Y4WRJiGQxQkP0hfdL0/5/6YvdfOOClRgJ0qppSarkhneSa6UvkMkms0AkdGimSH3Ikqm+6mkMmX7vGA==} engines: {node: '>=12.22.0'} ip-address@10.0.1: @@ -30048,11 +30048,11 @@ snapshots: intl-parse-accept-language@1.0.0: {} - ioredis@5.3.2: + ioredis@5.6.1: dependencies: '@ioredis/commands': 1.2.0 cluster-key-slot: 1.1.2 - debug: 4.3.7(supports-color@10.0.0) + debug: 4.4.3(supports-color@10.0.0) denque: 2.1.0 lodash.defaults: 4.2.0 lodash.isarguments: 3.1.0 @@ -33909,7 
+33909,7 @@ snapshots: send@1.1.0(supports-color@10.0.0): dependencies: - debug: 4.3.6(supports-color@10.0.0) + debug: 4.4.3(supports-color@10.0.0) destroy: 1.2.0 encodeurl: 2.0.0 escape-html: 1.0.3 From 2a1e927eb04e0246d00268a48881ef5f1040ef4a Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 9 Jun 2026 18:23:30 +0100 Subject: [PATCH 10/23] refactor(webapp,run-engine): route the realtime runs feed through one per-env change channel Rework the new realtime runs backend (still behind its feature flag, with the existing backend as the default) so every feed is a predicate over one self-describing change record, published once per run change to a single per-environment channel. A per-instance router indexes the currently-held feeds by run, tag, and batch. On a change it hydrates the affected runs once and serializes them once, then fans the result to every matching feed, so one hot shared tag costs a single query and serialize no matter how many feeds watch it. Newly triggered runs surface immediately (the run-created event now carries tags and batch) rather than waiting for a status change. An admission gate bounds how many cold ClickHouse resolves run concurrently, so a mass reconnect across distinct filters queues instead of stampeding the database. Live long-polls hold for about 20 seconds to match the existing backend cadence. 
--- apps/webapp/app/env.server.ts | 11 +- .../app/models/runtimeEnvironment.server.ts | 21 +- .../app/routes/api.v1.runs.$runId.metadata.ts | 6 + .../app/routes/api.v1.runs.$runId.tags.ts | 11 +- .../realtime/electricStreamProtocol.server.ts | 20 + .../realtime/envChangeRouter.server.ts | 347 +++++++++++ .../realtime/notifierRealtimeClient.server.ts | 559 ++++++++++++------ .../notifierRealtimeClientInstance.server.ts | 48 +- .../realtime/runChangeNotifier.server.ts | 285 +++++---- .../runChangeNotifierHandlers.server.ts | 111 ++-- .../runChangeNotifierInstance.server.ts | 21 +- .../webapp/app/v3/runEngineHandlers.server.ts | 77 ++- .../test/realtime/envChangeRouter.test.ts | 187 ++++++ .../test/realtime/notifierHoldOnEmpty.test.ts | 171 ++++-- .../realtime/notifierRealtimeClient.test.ts | 11 +- .../test/realtime/notifierRunSetCache.test.ts | 95 ++- .../test/realtime/runChangeNotifier.test.ts | 292 +++------ .../run-engine/src/engine/eventBus.ts | 21 +- .../run-engine/src/engine/index.ts | 9 +- .../src/engine/systems/checkpointSystem.ts | 6 + .../src/engine/systems/delayedRunSystem.ts | 4 + .../src/engine/systems/dequeueSystem.ts | 6 + .../engine/systems/pendingVersionSystem.ts | 2 + .../src/engine/systems/runAttemptSystem.ts | 4 + 24 files changed, 1648 insertions(+), 677 deletions(-) create mode 100644 apps/webapp/app/services/realtime/envChangeRouter.server.ts create mode 100644 apps/webapp/test/realtime/envChangeRouter.test.ts diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 265dc3497f0..f01e8285916 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -306,8 +306,10 @@ const EnvironmentSchema = z // "1" = run-changed signals are published and the per-org `realtimeBackend` // feature flag selects the backend per request. REALTIME_NOTIFIER_ENABLED: z.string().default("0"), - // Backstop wait before a live notifier request refetches the run (ms). 
- REALTIME_NOTIFIER_LIVE_POLL_TIMEOUT_MS: z.coerce.number().int().default(5_000), + // Backstop wait before a live notifier request refetches the run (ms). Matches + // Electric's ~20s live long-poll hold so the client polling cadence is unchanged + // across backends (a ±15% jitter is applied per request to avoid refetch herds). + REALTIME_NOTIFIER_LIVE_POLL_TIMEOUT_MS: z.coerce.number().int().default(20_000), // Hard cap on the tag-list snapshot size served by the notifier feed. REALTIME_NOTIFIER_MAX_LIST_RESULTS: z.coerce.number().int().default(1_000), // Short-TTL coalescing cache for the multi-run (tag-list/batch) resolve+hydrate. @@ -333,6 +335,11 @@ const EnvironmentSchema = z // holding the long-poll (re-resolving cheaply) instead of returning an empty // up-to-date the client would immediately re-issue. "0" reverts to per-wake replies. REALTIME_NOTIFIER_HOLD_ON_EMPTY: z.string().default("1"), + // Max concurrent fresh ClickHouse resolves (cache misses) per instance. Caps the + // distinct-filter reconnect stampede: a mass reconnect of N feeds on N different filters + // queues to this many concurrent CH queries instead of firing all N at once. Same-filter + // bursts collapse via the single-flight cache before taking a permit. 0 disables the gate. + REALTIME_NOTIFIER_RESOLVE_ADMISSION_LIMIT: z.coerce.number().int().default(16), PUBSUB_REDIS_HOST: z .string() diff --git a/apps/webapp/app/models/runtimeEnvironment.server.ts b/apps/webapp/app/models/runtimeEnvironment.server.ts index 64b1da3be49..be05adaa8a7 100644 --- a/apps/webapp/app/models/runtimeEnvironment.server.ts +++ b/apps/webapp/app/models/runtimeEnvironment.server.ts @@ -237,10 +237,20 @@ export async function findEnvironmentBySlug( return environment ? toAuthenticated(environment) : null; } +// The authenticated environment plus the run scalars the realtime publish needs. +// Both come from one taskRun read — see findEnvironmentFromRun. 
+export type EnvironmentFromRun = { + environment: AuthenticatedEnvironment; + runTags: string[]; + batchId: string | null; +}; + export async function findEnvironmentFromRun( runId: string, tx?: PrismaClientOrTransaction -): Promise { +): Promise { + // The include (no select) already pulls every taskRun scalar, so runTags/batchId + // ride along for free — no extra query for the realtime publish to send a full record. const taskRun = await (tx ?? $replica).taskRun.findFirst({ where: { id: runId, @@ -249,7 +259,14 @@ export async function findEnvironmentFromRun( runtimeEnvironment: { include: authIncludeBase }, }, }); - return taskRun?.runtimeEnvironment ? toAuthenticated(taskRun.runtimeEnvironment) : null; + if (!taskRun?.runtimeEnvironment) { + return null; + } + return { + environment: toAuthenticated(taskRun.runtimeEnvironment), + runTags: taskRun.runTags, + batchId: taskRun.batchId, + }; } export async function createNewSession( diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts index ceae1efb4b4..65cbd29c627 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts @@ -12,6 +12,7 @@ import type { AuthenticatedEnvironment } from "~/services/apiAuth.server"; import { authenticateApiRequest } from "~/services/apiAuth.server"; import { logger } from "~/services/logger.server"; import { updateMetadataService } from "~/services/metadata/updateMetadataInstance.server"; +import { publishChangeRecord } from "~/services/realtime/runChangeNotifierInstance.server"; import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; import { ServiceValidationError } from "~/v3/services/common.server"; import { applyMetadataMutationToBufferedRun } from "~/v3/mollifier/applyMetadataMutation.server"; @@ -184,6 +185,11 @@ const { action } = createActionApiRoute( return json({ error: "Internal Server Error" }, { status: 
500 }); } if (pgResult) { + // Mid-run metadata flush succeeded: publish a run-changed record so a live single-run + // feed reflects metadata.set() without waiting for the next lifecycle event (this + // path doesn't otherwise touch the engine event bus). envId is free; partial record, + // matched by runId. No-op when disabled. + publishChangeRecord({ runId, envId: env.id }); return json(pgResult, { status: 200 }); } diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts index 9dd184fa25e..c8fa5ea37d2 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts @@ -7,7 +7,7 @@ import { MAX_TAGS_PER_RUN } from "~/models/taskRunTag.server"; import { authenticateApiRequest } from "~/services/apiAuth.server"; import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; import { logger } from "~/services/logger.server"; -import { publishRunChanged } from "~/services/realtime/runChangeNotifierInstance.server"; +import { publishChangeRecord } from "~/services/realtime/runChangeNotifierInstance.server"; import { mutateWithFallback } from "~/v3/mollifier/mutateWithFallback.server"; // Pull the existing tags out of a buffer entry's serialised payload so @@ -91,8 +91,13 @@ export async function action({ request, params }: ActionFunctionArgs) { }, data: { runTags: { push: newTags } }, }); - // Delegate a run-changed notify (no-op unless enabled). - publishRunChanged({ runId: taskRun.id, environmentId: env.id }); + // Publish a run-changed record with the NEW tag set so tag feeds reindex + // (no-op unless enabled). + publishChangeRecord({ + runId: taskRun.id, + envId: env.id, + tags: existing.concat(newTags), + }); return json({ message: `Successfully set ${newTags.length} new tags.` }, { status: 200 }); }, // Buffer-applied patch path. 
The mutateSnapshot Lua deduplicates diff --git a/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts b/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts index c7c90a7f17b..6a276bcb03d 100644 --- a/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts +++ b/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts @@ -273,6 +273,26 @@ export function buildRowsBody(changes: RowChange[], skipColumns: string[] = []): return JSON.stringify(messages); } +/** A row change whose wire `value` was already serialized (once, shared across feeds by + * the EnvChangeRouter); the per-feed `operation` is applied here. */ +export type SerializedRowChange = { + runId: string; + value: Record; + operation: "insert" | "update"; +}; + +/** Like `buildRowsBody`, but from values serialized once per (runId, columnSet) upstream, + * so a run matching many feeds is serialized once and reused across their bodies. */ +export function buildRowsBodyFromSerialized(changes: SerializedRowChange[]): string { + const messages: ShapeMessage[] = changes.map((change) => ({ + key: runShapeKey(change.runId), + value: change.value, + headers: { operation: change.operation }, + })); + messages.push(UP_TO_DATE); + return JSON.stringify(messages); +} + export const INITIAL_OFFSET = "-1"; /** diff --git a/apps/webapp/app/services/realtime/envChangeRouter.server.ts b/apps/webapp/app/services/realtime/envChangeRouter.server.ts new file mode 100644 index 00000000000..0c68140e58b --- /dev/null +++ b/apps/webapp/app/services/realtime/envChangeRouter.server.ts @@ -0,0 +1,347 @@ +import { type ChangeRecord } from "./runChangeNotifier.server"; +import { type RealtimeRunRow, serializeRunRow } from "./electricStreamProtocol.server"; + +/** + * EnvChangeRouter — the per-instance routing layer that turns "feeds as predicates over + * one env stream" into cheap fan-out. 
+ * + * It owns ONE subscription per environment (over the RunChangeNotifier) and an inverted + * index of the feeds currently held by THIS instance: `runId -> feeds`, `tag -> feeds`, + * `batchId -> feeds`. On a coalesced batch of ChangeRecords it: + * 1. routes each record to only the matching held feeds via the index (O(record-tags), + * not O(feeds)) — a record that matches nothing costs nothing; + * 2. batch-hydrates the matched runs from Postgres ONCE per column set (collapsing the + * hot-shared-tag fan-out: one run matching N feeds = one `hydrateByIds`, not N); + * 3. serializes each row's wire value ONCE per column set, reused across all matching + * feeds; + * 4. resolves each matching feed's pending wait with its hydrated+serialized rows. + * + * It is stateless across reconnects: the index is rebuilt from whatever feeds this + * instance happens to hold, so no shape affinity or cross-poll memory is required. The + * per-handle working-set diff (insert vs update) stays in the consumer; the router only + * decides membership, hydrates, and serializes. + */ + +export type WakeReason = "notify" | "timeout" | "abort"; + +/** A feed's membership predicate over the env stream. */ +export type FeedFilter = + | { kind: "run"; runId: string } + | { kind: "tag"; tags: string[]; createdAtFloorMs?: number } + | { kind: "batch"; batchId: string }; + +/** A matched run handed to a feed: the hydrated row (for the feed's working-set diff) and + * its wire `value` serialized once for this feed's column set (shared across feeds). */ +export type MatchedRow = { row: RealtimeRunRow; value: Record }; + +export type WaitResult = { reason: WakeReason; rows: MatchedRow[] }; + +/** Minimal deps so the router is unit-testable without Redis/Postgres. 
*/ +export interface EnvChangeSource { + subscribeToEnv(environmentId: string, onBatch: (records: ChangeRecord[]) => void): () => void; +} +export interface RowHydrator { + hydrateByIds( + environmentId: string, + ids: string[], + skipColumns: string[] + ): Promise; +} + +export type EnvChangeRouterOptions = { + source: EnvChangeSource; + hydrator: RowHydrator; + /** Observability: a hydrate-by-id batch ran (count = runs hydrated this tick). */ + onHydrate?: (runCount: number) => void; +}; + +/** Handle a feed holds for the duration of one long-poll. */ +export type FeedRegistration = { + /** Wait for the next batch matching this feed (or timeout/abort), with the matched runs + * hydrated + serialized for this feed's columns. One wait active at a time. */ + waitForMatch(signal: AbortSignal | undefined, timeoutMs: number): Promise; + /** Deregister from the index; unsubscribes the env when the last feed leaves. */ + close(): void; +}; + +type Feed = { + filter: FeedFilter; + skipColumns: string[]; + columnSig: string; + /** The currently-waiting poll's resolver (null between polls). */ + resolve: ((result: WaitResult) => void) | null; +}; + +type EnvState = { + unsubscribe: () => void; + feeds: Set; + byRunId: Map>; + byTag: Map>; + byBatchId: Map>; + /** All tag feeds, for routing partial records (no tags) as hydrate-to-classify candidates. 
*/ + tagFeeds: Set; +}; + +function addToIndex(index: Map>, key: string, feed: Feed) { + let set = index.get(key); + if (!set) { + set = new Set(); + index.set(key, set); + } + set.add(feed); +} + +function removeFromIndex(index: Map>, key: string, feed: Feed) { + const set = index.get(key); + if (set) { + set.delete(feed); + if (set.size === 0) { + index.delete(key); + } + } +} + +export class EnvChangeRouter { + readonly #envs = new Map(); + + constructor(private readonly options: EnvChangeRouterOptions) {} + + register(environmentId: string, filter: FeedFilter, skipColumns: string[]): FeedRegistration { + const env = this.#ensureEnv(environmentId); + const feed: Feed = { + filter, + skipColumns, + columnSig: skipColumns.length > 0 ? [...skipColumns].sort().join(",") : "", + resolve: null, + }; + + env.feeds.add(feed); + this.#indexFeed(env, feed); + + const waitForMatch = (signal: AbortSignal | undefined, timeoutMs: number) => + new Promise((resolve) => { + if (signal?.aborted) { + resolve({ reason: "abort", rows: [] }); + return; + } + let settled = false; + let timer: ReturnType | undefined; + let onAbort: (() => void) | undefined; + const settle = (result: WaitResult) => { + if (settled) return; + settled = true; + feed.resolve = null; + if (timer) clearTimeout(timer); + if (signal && onAbort) signal.removeEventListener("abort", onAbort); + resolve(result); + }; + feed.resolve = settle; + timer = setTimeout(() => settle({ reason: "timeout", rows: [] }), timeoutMs); + timer.unref?.(); + if (signal) { + onAbort = () => settle({ reason: "abort", rows: [] }); + signal.addEventListener("abort", onAbort, { once: true }); + } + }); + + const close = () => { + if (!env.feeds.has(feed)) { + return; + } + env.feeds.delete(feed); + this.#deindexFeed(env, feed); + // Resolve any in-flight wait so the poll doesn't hang. 
+ feed.resolve?.({ reason: "abort", rows: [] }); + feed.resolve = null; + if (env.feeds.size === 0) { + this.#envs.delete(environmentId); + env.unsubscribe(); + } + }; + + return { waitForMatch, close }; + } + + /** Distinct environments currently routed (for metrics). */ + get activeEnvCount(): number { + return this.#envs.size; + } + + #ensureEnv(environmentId: string): EnvState { + const existing = this.#envs.get(environmentId); + if (existing) { + return existing; + } + const env: EnvState = { + unsubscribe: () => {}, + feeds: new Set(), + byRunId: new Map(), + byTag: new Map(), + byBatchId: new Map(), + tagFeeds: new Set(), + }; + this.#envs.set(environmentId, env); + env.unsubscribe = this.options.source.subscribeToEnv(environmentId, (records) => { + // Fire-and-forget; the notifier doesn't await us. Errors fall through to the feeds' + // backstop (a hydrate failure leaves waiters to time out into a full resolve). + void this.#onBatch(environmentId, env, records); + }); + return env; + } + + #indexFeed(env: EnvState, feed: Feed) { + switch (feed.filter.kind) { + case "run": + addToIndex(env.byRunId, feed.filter.runId, feed); + break; + case "batch": + addToIndex(env.byBatchId, feed.filter.batchId, feed); + break; + case "tag": + env.tagFeeds.add(feed); + for (const tag of feed.filter.tags) { + addToIndex(env.byTag, tag, feed); + } + break; + } + } + + #deindexFeed(env: EnvState, feed: Feed) { + switch (feed.filter.kind) { + case "run": + removeFromIndex(env.byRunId, feed.filter.runId, feed); + break; + case "batch": + removeFromIndex(env.byBatchId, feed.filter.batchId, feed); + break; + case "tag": + env.tagFeeds.delete(feed); + for (const tag of feed.filter.tags) { + removeFromIndex(env.byTag, tag, feed); + } + break; + } + } + + async #onBatch(environmentId: string, env: EnvState, records: ChangeRecord[]) { + // 1. Route each record to the held feeds it matches; collect matched runIds per feed. 
+ const matchedRunIdsByFeed = new Map>(); + const addMatch = (feed: Feed, runId: string) => { + if (!feed.resolve) { + // Feed isn't currently waiting (between polls). Drop — its backstop catches gaps. + return; + } + let set = matchedRunIdsByFeed.get(feed); + if (!set) { + set = new Set(); + matchedRunIdsByFeed.set(feed, set); + } + set.add(runId); + }; + + for (const record of records) { + // run feeds: exact runId match. + const runFeeds = env.byRunId.get(record.runId); + if (runFeeds) { + for (const feed of runFeeds) addMatch(feed, record.runId); + } + + // batch feeds: exact batchId match (only when the record carries one). + if (record.batchId) { + const batchFeeds = env.byBatchId.get(record.batchId); + if (batchFeeds) { + for (const feed of batchFeeds) addMatch(feed, record.runId); + } + } + + // tag feeds. + if (record.tags !== undefined) { + // Full record: prune via the tag index; only feeds whose filter intersects match. + const seen = new Set(); + for (const tag of record.tags) { + const tagFeeds = env.byTag.get(tag); + if (!tagFeeds) continue; + for (const feed of tagFeeds) { + if (seen.has(feed)) continue; + seen.add(feed); + addMatch(feed, record.runId); + } + } + } else { + // Partial record (no membership data): route to every tag feed as a candidate to + // hydrate-and-classify (rare; the publish side emits full records in practice). + for (const feed of env.tagFeeds) addMatch(feed, record.runId); + } + } + + if (matchedRunIdsByFeed.size === 0) { + return; + } + + // 2. Batch-hydrate ONCE per column set, then 3. serialize ONCE per (runId, column set). 
+ const runIdsByColumnSig = new Map }>(); + for (const [feed, runIds] of matchedRunIdsByFeed) { + let group = runIdsByColumnSig.get(feed.columnSig); + if (!group) { + group = { skipColumns: feed.skipColumns, runIds: new Set() }; + runIdsByColumnSig.set(feed.columnSig, group); + } + for (const id of runIds) group.runIds.add(id); + } + + const hydratedByColumnSig = new Map>(); + await Promise.all( + [...runIdsByColumnSig.entries()].map(async ([columnSig, group]) => { + const ids = [...group.runIds]; + const rows = await this.options.hydrator.hydrateByIds( + environmentId, + ids, + group.skipColumns + ); + this.options.onHydrate?.(rows.length); + const map = new Map(); + for (const row of rows) { + map.set(row.id, { row, value: serializeRunRow(row, group.skipColumns) }); + } + hydratedByColumnSig.set(columnSig, map); + }) + ); + + // 4. Assemble each feed's matched rows (post-filtering tag feeds against the + // authoritative hydrated row) and resolve its pending wait. + for (const [feed, runIds] of matchedRunIdsByFeed) { + if (!feed.resolve) { + continue; // stopped waiting while we hydrated; its next poll/backstop covers it + } + const hydrated = hydratedByColumnSig.get(feed.columnSig); + if (!hydrated) continue; + + const rows: MatchedRow[] = []; + for (const runId of runIds) { + const matched = hydrated.get(runId); + if (!matched) continue; // run not found / left the table + if (feed.filter.kind === "tag" && !this.#tagRowMatches(matched.row, feed.filter)) { + continue; // re-confirm tags + createdAt floor against the authoritative row + } + rows.push(matched); + } + + if (rows.length > 0) { + feed.resolve({ reason: "notify", rows }); + } + // No surviving rows (e.g. a partial-record candidate that didn't actually match): + // leave the feed waiting; nothing relevant changed for it. + } + } + + /** Authoritative re-check for tag feeds: the hydrated row's tags intersect the filter + * and its createdAt is within the feed's window. 
Handles partial-record candidates and + * guards record/row tag skew. */ + #tagRowMatches(row: RealtimeRunRow, filter: Extract): boolean { + if (filter.createdAtFloorMs !== undefined && row.createdAt.getTime() < filter.createdAtFloorMs) { + return false; + } + const rowTags = row.runTags ?? []; + return filter.tags.some((tag) => rowTags.includes(tag)); + } +} diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts index b0f6aade879..7a62fd429c9 100644 --- a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts +++ b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts @@ -12,6 +12,7 @@ import { logger } from "../logger.server"; import { buildElectricSchemaHeader, buildRowsBody, + buildRowsBodyFromSerialized, buildSnapshotBody, buildUpdateBody, buildUpToDateBody, @@ -22,9 +23,14 @@ import { rewriteBodyForLegacyApiVersion, RESERVED_COLUMNS, type RowChange, + type SerializedRowChange, } from "./electricStreamProtocol.server"; import { BoundedTtlCache } from "./boundedTtlCache"; -import { type RunChangeNotifier, type RunChangeSubscription } from "./runChangeNotifier.server"; +import { + type EnvChangeRouter, + type FeedFilter, + type MatchedRow, +} from "./envChangeRouter.server"; import { type RunHydrator, type RunListResolver } from "./runReader.server"; import { type RealtimeConcurrencyLimiter } from "./realtimeConcurrencyLimiter.server"; @@ -68,11 +74,18 @@ export interface RealtimeStreamClient { export type WakeupReason = "notify" | "timeout" | "abort"; +/** How a live poll resolved, for observability: + * - `fast-hydrate`: the router woke this feed with matched rows (hydrated by id, NO + * ClickHouse). Non-matching changes never wake the feed, so they cost nothing. + * - `full-resolve`: the backstop timeout did a ClickHouse resolve (the correctness net). 
*/ +export type LivePollPath = "fast-hydrate" | "full-resolve"; + export type NotifierRealtimeClientOptions = { runReader: RunHydrator; /** Resolves the tag/list filter into the matching id-set (filter-only). */ runListResolver: RunListResolver; - notifier: RunChangeNotifier; + /** Per-instance routing layer over the single env change channel. */ + router: EnvChangeRouter; limiter: RealtimeConcurrencyLimiter; cachedLimitProvider: CachedLimitProvider; /** Backstop wait before refetching on a live request (ms). Defaults to 5000. */ @@ -81,7 +94,7 @@ export type NotifierRealtimeClientOptions = { maximumCreatedAtFilterAgeMs: number; /** Hard cap on tag-list snapshot size. Defaults to 1000. */ maxListResults?: number; - /** TTL (ms) for the multi-run resolve+hydrate coalescing cache. Defaults to 1000. */ + /** TTL (ms) for the multi-run resolve+hydrate coalescing cache (initial + backstop). */ runSetResolveCacheTtlMs?: number; /** Max entries in the resolve+hydrate cache. Defaults to 5000. */ runSetResolveCacheMaxEntries?: number; @@ -91,28 +104,80 @@ export type NotifierRealtimeClientOptions = { * same-tag feeds pinned within the same bucket share a cache entry. Defaults to * 60000. 0 disables bucketing. */ runSetCreatedAtBucketMs?: number; - /** When true (default), a multi-run live poll woken by a change irrelevant to its - * filter keeps holding the long-poll (re-resolving cheaply on each wake) instead of - * returning an empty up-to-date the client would immediately re-issue. The empty - * response is the dominant cost under a busy per-env wake channel. */ + /** When true (default), a multi-run live poll holds the connection until a real delta + * or the backstop, rather than returning an empty up-to-date the client would re-issue. */ holdOnEmpty?: boolean; + /** Max concurrent fresh ClickHouse resolves (cache misses) across this instance. Bounds a + * distinct-filter reconnect stampede so it queues instead of hammering ClickHouse. 
Defaults + * to 16; 0 disables the gate (unbounded). */ + resolveAdmissionLimit?: number; /** Observability hook: why a live request woke (notify vs timeout vs abort). */ onWakeup?: (reason: WakeupReason) => void; - /** Observability hook: whether a multi-run resolve hit the cache, coalesced onto - * an in-flight resolve, or missed (issued fresh ClickHouse + Postgres queries). */ + /** Observability hook: how a live poll resolved (fast path vs full resolve). */ + onLivePollPath?: (path: LivePollPath) => void; + /** Observability hook: whether a multi-run resolve (initial/backstop) hit the cache, + * coalesced onto an in-flight resolve, or missed (fresh ClickHouse + Postgres). */ onRunSetResolve?: (result: "hit" | "miss" | "coalesced") => void; /** Observability hook: latency (ms) of the ClickHouse resolve / Postgres hydrate. */ onRunSetQuery?: (stage: "resolve" | "hydrate", ms: number) => void; + /** Observability hook: a fresh resolve had to wait `ms` for an admission permit (the gate + * engaged — i.e. a stampede was throttled). Not called when a permit is free. */ + onResolveAdmissionWait?: (ms: number) => void; }; const DEFAULT_CONCURRENCY_LIMIT = 100_000; -const DEFAULT_LIVE_POLL_TIMEOUT_MS = 5_000; +// Matches Electric's ~20s live long-poll hold (jittered ±15% per request). +const DEFAULT_LIVE_POLL_TIMEOUT_MS = 20_000; const DEFAULT_MAX_LIST_RESULTS = 1_000; const LIST_CACHE_TTL_MS = 5 * 60_000; const LIST_CACHE_MAX_ENTRIES = 10_000; const DEFAULT_RUNSET_CACHE_TTL_MS = 1_000; const DEFAULT_RUNSET_CACHE_MAX_ENTRIES = 5_000; const DEFAULT_RUNSET_CREATED_AT_BUCKET_MS = 60_000; +const DEFAULT_RESOLVE_ADMISSION_LIMIT = 16; + +/** + * Fair FIFO semaphore bounding how many fresh ClickHouse resolves run concurrently. 
It sits + * BEHIND the single-flight + TTL cache, so only genuine cache-miss resolves take a permit: a + * same-filter reconnect stampede still collapses to one in-flight resolve (one permit), while + * a distinct-filter stampede — where every filter is a different cache key and so can't + * coalesce — is throttled to `limit` concurrent CH queries instead of firing all N at the + * database at once. Trades a little connect latency under a stampede for bounded CH load. + */ +class ResolveAdmissionGate { + #available: number; + #inUse = 0; + readonly #waiters: Array<() => void> = []; + + constructor(limit: number) { + this.#available = limit; + } + + /** Permits currently held (for a metrics gauge); never exceeds the limit. */ + get inUse(): number { + return this.#inUse; + } + + async acquire(): Promise { + if (this.#available > 0) { + this.#available--; + this.#inUse++; + return; + } + await new Promise((resolve) => this.#waiters.push(resolve)); + this.#inUse++; + } + + release(): void { + this.#inUse--; + const next = this.#waiters.shift(); + if (next) { + next(); // hand the freed permit straight to the next waiter (FIFO, no count churn) + } else { + this.#available++; + } + } +} /** A multi-run feed's filter. Tag-list sets `tags` (+ pinned `createdAtAfter`); * the batch feed sets `batchId`. Both resolve to an id-set via the resolver. */ @@ -134,37 +199,39 @@ type ResponseHeaderInput = { }; /** - * Notifier-backed implementation of the realtime run feeds: signals run changes - * over Redis pub/sub and refetches the current rows from a read replica. + * Notifier-backed implementation of the realtime run feeds. All three feeds are + * predicates over ONE per-environment change stream (the EnvChangeRouter); the router + * decides membership, hydrates the matched runs from a read replica, and serializes their + * wire values once. This client owns the snapshot, the per-handle working-set diff, the + * ClickHouse-backed backstop, and the wire response. 
* * Single-run (`streamRun`): - * - initial (`offset=-1`): hydrate + emit `insert` + `up-to-date` (with schema) - * - live: race a per-run notification vs a ~5s backstop and the abort signal, - * refetch, and emit a full-row `update` ONLY when `updatedAt` advanced past what - * the client has (a stale replica read never regresses); else a bare `up-to-date`. + * - initial (`offset=-1`): hydrate + emit `insert` + `up-to-date` (with schema). + * - live: the router wakes this feed when its run changes; emit a full-row `update` when + * `updatedAt` advanced past what the client has, else a bare `up-to-date`. The backstop + * re-checks via `getRunById`. * - * Multi-run feeds (`streamRuns` tag-list, `streamBatch`) share one core: - * - initial: resolve the matching id-set via ClickHouse `listRunIds` (filter-only, - * tag-OR or batchId), hydrate by-id from Postgres, emit N `insert`s. - * - live: one per-env subscription wakes the feed; re-resolve the set, hydrate it, - * and emit only new (`insert`) / advanced (`update`) rows — diffed on the - * authoritative Postgres `updatedAt` against a per-handle working set (cache miss - * falls back to the offset floor, merge-safe). ClickHouse supplies membership; - * Postgres supplies fresh row state, so CH ingest lag never stales the rows. - * Tag-list pins its `createdAt` window in the handle; batch needs no window. + * Multi-run feeds (`streamRuns` tag-list, `streamBatch`): + * - initial: resolve the matching id-set via ClickHouse (filter-only), hydrate by-id from + * Postgres, emit N `insert`s, seed the working set. + * - live: the router wakes the feed with the matched runs already hydrated + serialized; + * diff them on the authoritative Postgres `updatedAt` against the per-handle working + * set and emit only new/advanced rows. The backstop (timeout) does a full ClickHouse + * resolve — the correctness net that catches gaps and drops departed runs. 
* - * Tokens are opaque: `offset` = `_`, `handle` is per-shape, - * `cursor` is a live-only counter. The wire format is produced by - * `electricStreamProtocol`. + * Tokens are opaque: `offset` = `_`, `handle` is per-shape, `cursor` + * is a live-only counter. The wire format is produced by `electricStreamProtocol`. */ export class NotifierRealtimeClient implements RealtimeStreamClient { #seq = 0; readonly #workingSetCache: BoundedTtlCache; - /** Coalescing cache for the multi-run (resolveIds -> hydrateByIds) pair, keyed by - * (env, filter, columns). Collapses an env-wide wake's per-feed query fan-out into - * one shared resolve+hydrate per filter per short window. */ + /** Coalescing cache for the multi-run (resolveIds -> hydrateByIds) pair used by the + * initial snapshot and the backstop, keyed by (env, filter, columns). Collapses a + * reconnect/snapshot stampede of identical filters into one shared resolve+hydrate. */ readonly #runSetCache: BoundedTtlCache; readonly #runSetInflight = new Map>(); + /** Bounds concurrent fresh CH resolves (undefined => unbounded). */ + readonly #admissionGate?: ResolveAdmissionGate; constructor(private readonly options: NotifierRealtimeClientOptions) { this.#workingSetCache = new BoundedTtlCache( @@ -175,6 +242,10 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { options.runSetResolveCacheTtlMs ?? DEFAULT_RUNSET_CACHE_TTL_MS, options.runSetResolveCacheMaxEntries ?? DEFAULT_RUNSET_CACHE_MAX_ENTRIES ); + const admissionLimit = options.resolveAdmissionLimit ?? DEFAULT_RESOLVE_ADMISSION_LIMIT; + if (admissionLimit > 0) { + this.#admissionGate = new ResolveAdmissionGate(admissionLimit); + } } /** Current size of the per-handle working-set cache (for a metrics gauge). */ @@ -182,6 +253,11 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { return this.#workingSetCache.size; } + /** Fresh CH resolves currently holding an admission permit (for a metrics gauge). 
*/ + get resolveAdmissionInUse(): number { + return this.#admissionGate?.inUse ?? 0; + } + async streamRun( url: URL | string, environment: RealtimeEnvironment, @@ -337,6 +413,12 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { }); } + /** + * Live poll for a single-run feed. The router wakes this feed when its run changes, + * with the run already hydrated + serialized (no ClickHouse, ever). On the backstop + * timeout it re-checks via `getRunById`. Only-on-advance: emit a full-row `update` when + * the row moved past what the client already has; else a bare `up-to-date`. + */ async #liveResponse(params: { environment: RealtimeEnvironment; runId: string; @@ -351,30 +433,67 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { params; return this.#withConcurrencySlot(environment, async () => { - const reason = await this.#waitForChange(runId, signal); - this.options.onWakeup?.(reason); - - const row = await this.options.runReader.getRunById(environment.id, runId); const lastSeenMs = parseOffsetUpdatedAtMs(offset); - const seq = this.#nextSeq(); - - // Only-on-advance: emit a full-row update when the replica row moved past - // what the client already has; otherwise a bare up-to-date keeps the offset. - // Live responses carry electric-cursor but NOT electric-schema (the client - // already has the schema from the initial snapshot) — matching real Electric. 
- if (row && row.updatedAt.getTime() > lastSeenMs) { - return this.#buildResponse(buildUpdateBody(row, skipColumns), apiVersion, clientVersion, { - offset: encodeOffset(row.updatedAt.getTime(), seq), + const registration = this.options.router.register( + environment.id, + { kind: "run", runId }, + skipColumns + ); + + try { + const { reason, rows } = await registration.waitForMatch(signal, this.#jitteredTimeout()); + this.options.onWakeup?.(reason); + + if (reason === "abort") { + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset, + handle, + cursor: String(this.#nextSeq()), + }); + } + + if (reason === "notify" && rows.length > 0) { + // The router hydrated + serialized this run; emit it (only on advance). + this.options.onLivePollPath?.("fast-hydrate"); + const matched = rows[0]; + const updatedAtMs = matched.row.updatedAt.getTime(); + const seq = this.#nextSeq(); + if (updatedAtMs > lastSeenMs) { + return this.#buildResponse( + buildRowsBodyFromSerialized([ + { runId: matched.row.id, value: matched.value, operation: "update" }, + ]), + apiVersion, + clientVersion, + { offset: encodeOffset(updatedAtMs, seq), handle, cursor: String(seq) } + ); + } + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset, + handle, + cursor: String(seq), + }); + } + + // Backstop timeout: re-check the run directly (no ClickHouse for the single-run feed). 
+ this.options.onLivePollPath?.("full-resolve"); + const row = await this.options.runReader.getRunById(environment.id, runId); + const seq = this.#nextSeq(); + if (row && row.updatedAt.getTime() > lastSeenMs) { + return this.#buildResponse(buildUpdateBody(row, skipColumns), apiVersion, clientVersion, { + offset: encodeOffset(row.updatedAt.getTime(), seq), + handle, + cursor: String(seq), + }); + } + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset, handle, cursor: String(seq), }); + } finally { + registration.close(); } - - return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { - offset, - handle, - cursor: String(seq), - }); }); } @@ -409,8 +528,16 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { }); } - /** Live poll for a multi-run feed: wait, re-resolve the set, and emit only the - * rows that are new or advanced vs the cached working set. */ + /** + * Live poll for a multi-run feed. Two paths: + * - Fast path (router notify): the router woke us with the matched runs already + * membership-confirmed, hydrated, and serialized (no ClickHouse). Diff them against + * the per-handle working set and emit new/advanced rows. + * - Backstop (timeout): a full ClickHouse resolve + hydrate. The correctness net — + * catches members missed during a gap and drops runs that left the filter. + * With hold-on-empty (default) the connection holds until a real delta or the backstop + * rather than returning an empty response the client would re-issue. + */ async #runSetLiveResponse( environment: RealtimeListEnvironment, filter: RunSetFilter, @@ -431,94 +558,189 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { // miss) and advanced on each refetch within this held request. let prevSeen = this.#workingSetCache.get(handle); - // The per-env channel wakes this feed on ANY run change in the environment, but - // most changes don't match this feed's filter. 
Rather than return an empty - // up-to-date the client would immediately re-issue (the dominant cost under a - // busy env), we hold the connection and only respond when THIS feed has a real - // delta or the backstop elapses. Each wake re-resolves via the coalesced + - // short-TTL cache, so an env-wide wake never fans out into per-feed CH+PG queries. - while (true) { - const remaining = deadline - Date.now(); - // One env-scoped subscription per wait (not one per run); re-subscribed each - // loop until a relevant delta or the budget runs out. - const reason = - remaining > 0 ? await this.#waitForEnvChange(environment.id, signal, remaining) : "timeout"; - this.options.onWakeup?.(reason); + const emitFromSerialized = (changes: SerializedRowChange[], maxUpdatedAt: number): Response => { + const seq = this.#nextSeq(); + return this.#buildResponse(buildRowsBodyFromSerialized(changes), apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, seq), + handle, + cursor: String(seq), + }); + }; + const emitFromRows = (changes: RowChange[], maxUpdatedAt: number): Response => { + const seq = this.#nextSeq(); + return this.#buildResponse(buildRowsBody(changes, skipColumns), apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, seq), + handle, + cursor: String(seq), + }); + }; + const emitUpToDate = (maxUpdatedAt: number): Response => { + const seq = this.#nextSeq(); + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset: encodeOffset(maxUpdatedAt, seq), + handle, + cursor: String(seq), + }); + }; - if (reason === "abort") { - // Client disconnected; the response is discarded. Skip the refetch. 
- return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { - offset, - handle, - cursor: String(this.#nextSeq()), - }); - } + const registration = this.options.router.register( + environment.id, + this.#feedFilter(filter), + skipColumns + ); - // ClickHouse resolves the (possibly stale) membership; Postgres hydrates the - // authoritative current rows, so status is always fresh even if CH lags. We - // refetch on every wake AND on the final timeout, so a wake missed during the - // brief re-subscribe gap is still caught by the backstop. - const rows = await this.#resolveAndHydrate(environment, filter, skipColumns); - - // Diff against what the client already has, using the hydrated updatedAt: - // prior working set => per-row (new = insert, advanced = update); miss => - // anything newer than the offset floor as a merge-safe update. - const changes: RowChange[] = []; - const seen: WorkingSet = new Map(); - let maxUpdatedAt = offsetFloorMs; - for (const row of rows) { - const updatedAtMs = row.updatedAt.getTime(); - seen.set(row.id, updatedAtMs); - maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs); - - if (prevSeen) { - const prior = prevSeen.get(row.id); - if (prior === undefined) { - changes.push({ row, operation: "insert" }); - } else if (updatedAtMs > prior) { - changes.push({ row, operation: "update" }); + try { + while (true) { + const remaining = deadline - Date.now(); + const { reason, rows } = + remaining > 0 + ? await registration.waitForMatch(signal, remaining) + : { reason: "timeout" as const, rows: [] as MatchedRow[] }; + this.options.onWakeup?.(reason); + + if (reason === "abort") { + return emitUpToDate(offsetFloorMs); + } + + // FAST PATH: the router already confirmed membership + the createdAt window and + // hydrated/serialized the matched runs. Just diff against the working set. 
+ if (reason === "notify") { + this.options.onLivePollPath?.("fast-hydrate"); + const { changes, maxUpdatedAt, touched } = this.#diffMatched( + rows, + prevSeen, + offsetFloorMs + ); + // Merge (not replace): the router only surfaced the changed subset, so keep the + // rest of the working set intact. The backstop full-resolve rebuilds it. + const merged = this.#mergeWorkingSet(prevSeen, touched); + this.#workingSetCache.set(handle, merged); + prevSeen = merged; + + if (changes.length > 0) { + return emitFromSerialized(changes, maxUpdatedAt); + } + // Matched but no row advanced (already seen). Keep holding. + if (holdOnEmpty) { + continue; } - } else if (updatedAtMs > offsetFloorMs) { - changes.push({ row, operation: "update" }); + return emitUpToDate(maxUpdatedAt); } + + // BACKSTOP: full ClickHouse resolve + hydrate. Replaces the working set so runs + // that left the filter stop being tracked (the client keeps showing them). + this.options.onLivePollPath?.("full-resolve"); + const resolved = await this.#resolveAndHydrate(environment, filter, skipColumns); + const { changes, maxUpdatedAt, touched } = this.#diffRows( + resolved, + prevSeen, + offsetFloorMs + ); + this.#workingSetCache.set(handle, touched); + prevSeen = touched; + + if (changes.length > 0) { + return emitFromRows(changes, maxUpdatedAt); + } + // Empty backstop diff: timeout returns up-to-date; (holdOnEmpty never reaches + // here on a notify — those are handled in the fast path above). + return emitUpToDate(maxUpdatedAt); } + } finally { + registration.close(); + } + }); + } - // Refresh the working set so runs that left the filter stop being tracked - // (the client keeps showing them; the SDK never applies deletes). - this.#workingSetCache.set(handle, seen); - prevSeen = seen; + /** Translate a multi-run filter into the router's membership predicate. 
*/ + #feedFilter(filter: RunSetFilter): FeedFilter { + if (filter.batchId !== undefined) { + return { kind: "batch", batchId: filter.batchId }; + } + return { + kind: "tag", + tags: filter.tags ?? [], + createdAtFloorMs: filter.createdAtAfter?.getTime(), + }; + } - if (changes.length > 0) { - const seq = this.#nextSeq(); - return this.#buildResponse(buildRowsBody(changes, skipColumns), apiVersion, clientVersion, { - offset: encodeOffset(maxUpdatedAt, seq), - handle, - cursor: String(seq), - }); + /** Diff router-matched rows (already serialized) against the prior working set, pairing + * each row's shared `value` with this feed's operation. */ + #diffMatched( + matched: MatchedRow[], + prevSeen: WorkingSet | undefined, + offsetFloorMs: number + ): { changes: SerializedRowChange[]; maxUpdatedAt: number; touched: WorkingSet } { + const changes: SerializedRowChange[] = []; + const touched: WorkingSet = new Map(); + let maxUpdatedAt = offsetFloorMs; + for (const { row, value } of matched) { + const updatedAtMs = row.updatedAt.getTime(); + touched.set(row.id, updatedAtMs); + maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs); + + if (prevSeen) { + const prior = prevSeen.get(row.id); + if (prior === undefined) { + changes.push({ runId: row.id, value, operation: "insert" }); + } else if (updatedAtMs > prior) { + changes.push({ runId: row.id, value, operation: "update" }); } + } else if (updatedAtMs > offsetFloorMs) { + changes.push({ runId: row.id, value, operation: "update" }); + } + } + return { changes, maxUpdatedAt, touched }; + } - // Empty diff. With hold-on-empty (default) keep waiting until a real delta or - // the budget elapses; otherwise fall back to the legacy per-wake up-to-date. 
- if (reason === "timeout" || !holdOnEmpty) { - const seq = this.#nextSeq(); - return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { - offset: encodeOffset(maxUpdatedAt, seq), - handle, - cursor: String(seq), - }); + /** + * Diff hydrated rows against the prior working set on the authoritative Postgres + * `updatedAt`: a run not in the set is an `insert`, one whose `updatedAt` advanced is an + * `update`. On a working-set miss, anything past the offset floor is a merge-safe + * `update`. Used by the snapshot and the backstop full-resolve. + */ + #diffRows( + rows: RealtimeRunRow[], + prevSeen: WorkingSet | undefined, + offsetFloorMs: number + ): { changes: RowChange[]; maxUpdatedAt: number; touched: WorkingSet } { + const changes: RowChange[] = []; + const touched: WorkingSet = new Map(); + let maxUpdatedAt = offsetFloorMs; + for (const row of rows) { + const updatedAtMs = row.updatedAt.getTime(); + touched.set(row.id, updatedAtMs); + maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs); + + if (prevSeen) { + const prior = prevSeen.get(row.id); + if (prior === undefined) { + changes.push({ row, operation: "insert" }); + } else if (updatedAtMs > prior) { + changes.push({ row, operation: "update" }); } - // reason === "notify" with an empty diff: keep holding (loop, re-subscribe). + } else if (updatedAtMs > offsetFloorMs) { + changes.push({ row, operation: "update" }); } - }); + } + return { changes, maxUpdatedAt, touched }; + } + + /** Merge fast-path touched rows into the prior working set. The fast path only saw the + * changed subset, so we keep the rest (the backstop full-resolve does the exact rebuild). */ + #mergeWorkingSet(prevSeen: WorkingSet | undefined, touched: WorkingSet): WorkingSet { + const merged: WorkingSet = new Map(prevSeen ?? 
undefined); + for (const [id, updatedAtMs] of touched) { + merged.set(id, updatedAtMs); + } + return merged; } /** - * Resolve the filter's id-set (ClickHouse) and hydrate the rows (Postgres), - * coalesced + short-TTL cached by (env, filter, columns). Every batch feed for a - * batch, and every tag feed sharing tags+window+columns, shares ONE resolve+hydrate - * instead of each firing its own when the per-env channel wakes them together. - * Concurrent callers await an in-flight resolve; callers within the TTL reuse the - * cached rows (staleness budget: up to the TTL; the next live poll catches up). + * Resolve the filter's id-set (ClickHouse) and hydrate the rows (Postgres), coalesced + + * short-TTL cached by (env, filter, columns). Used by the initial snapshot and the + * backstop. A reconnect/snapshot stampede of identical filters shares ONE resolve+hydrate + * (concurrent callers await the in-flight one; callers within the TTL reuse the rows). */ async #resolveAndHydrate( environment: RealtimeListEnvironment, @@ -540,7 +762,9 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { } this.options.onRunSetResolve?.("miss"); - const promise = this.#resolveAndHydrateUncached(environment, filter, skipColumns) + // Registered in #runSetInflight synchronously below, so same-filter callers that arrive + // while this is still waiting for an admission permit coalesce onto it (one permit, not N). + const promise = this.#admitAndResolveUncached(environment, filter, skipColumns) .then((rows) => { this.#runSetCache.set(key, rows); return rows; @@ -553,6 +777,29 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { return promise; } + /** Acquire an admission permit (if the gate is enabled) before the fresh CH+PG resolve, so + * a distinct-filter stampede is throttled to the configured concurrency. 
*/ + async #admitAndResolveUncached( + environment: RealtimeListEnvironment, + filter: RunSetFilter, + skipColumns: string[] + ): Promise { + if (!this.#admissionGate) { + return this.#resolveAndHydrateUncached(environment, filter, skipColumns); + } + const waitStart = Date.now(); + await this.#admissionGate.acquire(); + const waited = Date.now() - waitStart; + if (waited > 0) { + this.options.onResolveAdmissionWait?.(waited); + } + try { + return await this.#resolveAndHydrateUncached(environment, filter, skipColumns); + } finally { + this.#admissionGate.release(); + } + } + async #resolveAndHydrateUncached( environment: RealtimeListEnvironment, filter: RunSetFilter, @@ -697,64 +944,6 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { } } - #waitForChange(runId: string, signal?: AbortSignal, timeoutMs?: number): Promise { - return this.#waitForSubscription( - this.options.notifier.subscribeToRunChanges(runId), - signal, - timeoutMs - ); - } - - #waitForEnvChange( - environmentId: string, - signal?: AbortSignal, - timeoutMs?: number - ): Promise { - return this.#waitForSubscription( - this.options.notifier.subscribeToEnvChanges(environmentId), - signal, - timeoutMs - ); - } - - /** Race a notifier subscription against a timeout (the jittered backstop by default, - * or an explicit remaining budget when a live request holds across wakes) and the - * abort signal. */ - async #waitForSubscription( - subscription: RunChangeSubscription, - signal?: AbortSignal, - timeoutMs?: number - ): Promise { - if (signal?.aborted) { - subscription.unsubscribe(); - return "abort"; - } - - let timer: ReturnType | undefined; - let onAbort: (() => void) | undefined; - - try { - return await new Promise((resolve) => { - subscription.changed.then(() => resolve("notify")).catch(() => resolve("timeout")); - - timer = setTimeout(() => resolve("timeout"), timeoutMs ?? 
this.#jitteredTimeout()); - - if (signal) { - onAbort = () => resolve("abort"); - signal.addEventListener("abort", onAbort, { once: true }); - } - }); - } finally { - if (timer) { - clearTimeout(timer); - } - if (signal && onAbort) { - signal.removeEventListener("abort", onAbort); - } - subscription.unsubscribe(); - } - } - #jitteredTimeout(): number { const base = this.options.livePollTimeoutMs ?? DEFAULT_LIVE_POLL_TIMEOUT_MS; // +/-15% jitter to avoid synchronized refetch herds. diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts index 7b486b5a1b8..24d5f13b0c6 100644 --- a/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts +++ b/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts @@ -6,6 +6,7 @@ import { singleton } from "~/utils/singleton"; import { getCachedLimit } from "../platform.v3.server"; import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; import { ClickHouseRunListResolver } from "./clickHouseRunListResolver.server"; +import { EnvChangeRouter } from "./envChangeRouter.server"; import { NotifierRealtimeClient } from "./notifierRealtimeClient.server"; import { RealtimeConcurrencyLimiter } from "./realtimeConcurrencyLimiter.server"; import { getRunChangeNotifier } from "./runChangeNotifierInstance.server"; @@ -19,7 +20,7 @@ import { RunHydrator } from "./runReader.server"; function initializeNotifierRealtimeClient(): NotifierRealtimeClient { const wakeups = new Counter({ name: "realtime_notifier_wakeups_total", - help: "Live realtime notifier wakeups by reason. A rising 'timeout' share suggests a write site is missing its publishRunChanged delegate.", + help: "Live realtime notifier wakeups by reason. 
A rising 'timeout' share suggests a write site is missing its publishChangeRecord delegate.", labelNames: ["reason"] as const, registers: [metricsRegister], }); @@ -39,6 +40,25 @@ function initializeNotifierRealtimeClient(): NotifierRealtimeClient { registers: [metricsRegister], }); + const livePollPaths = new Counter({ + name: "realtime_notifier_live_poll_total", + help: "How live polls resolved. 'fast-hydrate' = the router woke the feed with matched runs hydrated by id (no ClickHouse); 'full-resolve' = the backstop timeout did a ClickHouse resolve. A high fast-path share is the local-membership routing working.", + labelNames: ["path"] as const, + registers: [metricsRegister], + }); + + const routerHydrates = new Counter({ + name: "realtime_notifier_router_hydrated_runs_total", + help: "Runs hydrated by the EnvChangeRouter's batch-hydrate (one query per column set per wake, shared across all feeds matching the same run — the hot-shared-tag fan-out collapse).", + registers: [metricsRegister], + }); + + const resolveAdmissionWaits = new Counter({ + name: "realtime_notifier_resolve_admission_waits_total", + help: "Fresh ClickHouse resolves that had to queue for an admission permit. A rising count means a distinct-filter reconnect stampede is being throttled (the gate is doing its job).", + registers: [metricsRegister], + }); + const limiter = new RealtimeConcurrencyLimiter({ keyPrefix: "tr:realtime:notifier:concurrency", redis: { @@ -51,14 +71,24 @@ function initializeNotifierRealtimeClient(): NotifierRealtimeClient { }, }); + // One RunHydrator shared by the router (fast-path batch-hydrate) and the client + // (snapshot + backstop), so its single-flight + short-TTL cache covers both. 
+ const runReader = new RunHydrator({ replica: $replica }); + + const router = new EnvChangeRouter({ + source: getRunChangeNotifier(), + hydrator: runReader, + onHydrate: (runCount) => routerHydrates.inc(runCount), + }); + const client = new NotifierRealtimeClient({ - runReader: new RunHydrator({ replica: $replica }), + runReader, runListResolver: new ClickHouseRunListResolver({ getClickhouse: (organizationId) => clickhouseFactory.getClickhouseForOrganization(organizationId, "realtime"), prisma: $replica, }), - notifier: getRunChangeNotifier(), + router, limiter, cachedLimitProvider: { async getCachedLimit(organizationId, defaultValue) { @@ -78,9 +108,12 @@ function initializeNotifierRealtimeClient(): NotifierRealtimeClient { listCacheMaxEntries: env.REALTIME_NOTIFIER_WORKING_SET_MAX_ENTRIES, runSetCreatedAtBucketMs: env.REALTIME_NOTIFIER_RUNSET_CREATED_AT_BUCKET_MS, holdOnEmpty: env.REALTIME_NOTIFIER_HOLD_ON_EMPTY === "1", + resolveAdmissionLimit: env.REALTIME_NOTIFIER_RESOLVE_ADMISSION_LIMIT, onWakeup: (reason) => wakeups.inc({ reason }), + onLivePollPath: (path) => livePollPaths.inc({ path }), onRunSetResolve: (result) => runSetResolves.inc({ result }), onRunSetQuery: (stage, ms) => runSetQueryMs.observe({ stage }, ms), + onResolveAdmissionWait: () => resolveAdmissionWaits.inc(), }); new Gauge({ @@ -92,6 +125,15 @@ function initializeNotifierRealtimeClient(): NotifierRealtimeClient { }, }); + new Gauge({ + name: "realtime_notifier_resolve_admission_in_use", + help: "Fresh ClickHouse resolves currently holding an admission permit (live concurrency against the gate's limit).", + registers: [metricsRegister], + collect() { + this.set(client.resolveAdmissionInUse); + }, + }); + return client; } diff --git a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts index 45b65732ed8..f975af05723 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts +++ 
b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts @@ -1,81 +1,128 @@ import { createRedisClient, RedisClient, RedisWithClusterOptions } from "~/redis.server"; import { logger } from "../logger.server"; -export type RunChangeInput = { +export const CHANGE_RECORD_VERSION = 1; + +/** + * A run-change fact, published once to the run's environment channel. Self-describing: + * - `envId` routes it to its channel (mandatory). + * - `tags` / `batchId` let a tag/batch feed decide membership LOCALLY, without a + * ClickHouse re-resolve. `tags` present (even `[]`) marks a "full" record; `tags` + * absent marks a "partial" record (envId+runId only) that a tag feed must hydrate to + * classify. `batchId` present only when the run is in a batch. + * - `runId` lets a single-run feed match; `createdAtMs` lets a tag feed apply its + * createdAt floor locally; `updatedAtMs`/`status` are hints. + * Row state (payload/output/...) is never on the wire — it's refetched from Postgres. + */ +export type ChangeRecord = { + v: number; runId: string; - /** - * Optional. The single-run channel is keyed by runId alone; environmentId is - * carried for the per-env channels and metrics. Write sites that don't - * have it cheaply in scope may omit it. - */ - environmentId?: string; - /** Optional monotonic hint; not required since consumers always refetch. */ - version?: number; + envId: string; + tags?: string[]; + batchId?: string | null; + createdAtMs?: number; + updatedAtMs?: number; + status?: string; }; +/** What a publish site provides; the notifier stamps the version. */ +export type ChangeRecordInput = Omit<ChangeRecord, "v">; + +export function encodeChangeRecord(record: ChangeRecord): string { + return JSON.stringify(record); +} + +/** Decode a wire message into a ChangeRecord. Tolerant of a bare runId (no membership + * data) so a malformed/legacy frame degrades to a partial record (hydrate-to-classify) + * rather than throwing. 
*/ +export function decodeChangeRecord(message: string): ChangeRecord { + if (message.length === 0 || message[0] !== "{") { + return { v: 0, runId: message, envId: "" }; + } + try { + const parsed = JSON.parse(message) as Partial<ChangeRecord>; + if (parsed && typeof parsed.runId === "string") { + return { + v: parsed.v ?? 0, + runId: parsed.runId, + envId: parsed.envId ?? "", + tags: parsed.tags, + batchId: parsed.batchId, + createdAtMs: parsed.createdAtMs, + updatedAtMs: parsed.updatedAtMs, + status: parsed.status, + }; + } + } catch { + // fall through to the bare-runId fallback + } + return { v: 0, runId: message, envId: "" }; +} + export type RunChangeNotifierOptions = { redis: RedisWithClusterOptions; - /** Channel name prefix; the runId is appended inside a hash-tag for slot locality. */ + /** Channel name prefix; the envId is appended inside a hash-tag for slot locality. */ channelPrefix?: string; connectionName?: string; /** - * Leading-edge throttle (ms) for the high-volume per-env channel: deliver the - * first wake immediately, then at most one more per window while changes keep - * arriving. Bounds the feed-wake rate per env regardless of run throughput. - * Defaults to 100ms. 0 disables coalescing (wake on every message). + * Leading-edge throttle (ms) for the per-env channel: deliver the first wake + * immediately, then at most one more per window while changes keep arriving. Bounds the + * wake rate per env regardless of run throughput. Defaults to 100ms. 0 disables it. */ envWakeCoalesceWindowMs?: number; /** - * Use Redis sharded pub/sub (SSUBSCRIBE/SPUBLISH) instead of classic pub/sub. - * Only valid against a Redis Cluster (the channels are hash-tagged by run/env id, - * so each lands on one shard) and requires the client to be built with - * `clusterOptions.shardedSubscribers: true`. Classic PUBLISH in a cluster - * broadcasts to every node, so sharded pub/sub is what actually distributes the - * load. 
Defaults to false (classic pub/sub, for single-node / local). + * Use Redis sharded pub/sub (SSUBSCRIBE/SPUBLISH) instead of classic pub/sub. Only + * valid against a Redis Cluster (channels are hash-tagged by envId, so each lands on one + * shard) and requires the client built with `clusterOptions.shardedSubscribers: true`. + * Classic PUBLISH in a cluster broadcasts to every node, so sharded pub/sub is what + * actually distributes the load. Defaults to false (classic, for single-node / local). */ shardedPubSub?: boolean; }; -export type RunChangeSubscription = { - /** Resolves the next time a change is published for the subscribed run. */ - changed: Promise; - unsubscribe: () => void; -}; - const DEFAULT_CHANNEL_PREFIX = "realtime:"; const DEFAULT_ENV_WAKE_COALESCE_WINDOW_MS = 100; /** - * RunChangeNotifier — the single, encapsulated module that carries "run X changed" - * signals from write sites to the realtime feed. + * RunChangeNotifier — carries "run X changed" facts from write sites to the realtime + * feed over ONE per-environment channel. * * Design constraints baked in here: - * - IDs only on the wire, never row data. Consumers refetch from Postgres. + * - ONE channel type, `env:{}`. A change is one fact published once; who + * cares about it is a predicate evaluated by the consumer (the EnvChangeRouter), not a + * second channel. Single-run, tag, and batch feeds all read this one stream. + * - Minimal wire data (a self-describing `ChangeRecord` of small keys), never row + * columns. Row state is always refetched from Postgres. * - ONE shared, multiplexed subscriber connection per process with a refcounted - * `Map>` (per-run + per-env channels). The RunQueue - * pattern, deliberately NOT - * the per-subscribe-connection pattern of ZodPubSub/tracePubSub (which would - * exhaust ElastiCache `maxclients`). - * - Connections are created lazily: a process that never publishes or subscribes - * (the default, flag-off state) opens no Redis connections at all. 
- * - `publish` is fire-and-forget and never throws; a dropped publish only costs - * latency because the consumer has a timeout backstop. + * `Map<channel, Set<listener>>`. The RunQueue pattern, deliberately NOT the + * per-subscribe-connection pattern of ZodPubSub/tracePubSub (which would exhaust + * ElastiCache `maxclients`). + * - Connections are created lazily: a process that never publishes or subscribes (the + * default, flag-off state) opens no Redis connections at all. + * - `publish` is fire-and-forget and never throws; a dropped publish only costs latency + * because the consumer has a timeout backstop. * - * Channels are hash-tagged (`{}` / `env:{}`) so they - * land on a single cluster slot. With `shardedPubSub` (cluster only) the feed uses - * SSUBSCRIBE/SPUBLISH so each run/env's traffic stays on one shard rather than - * broadcasting cluster-wide; classic pub/sub is used single-node. + * Channels are hash-tagged (`env:{<environmentId>}`) so an env's traffic lands on one + * cluster slot. With `shardedPubSub` (cluster only) the feed uses SSUBSCRIBE/SPUBLISH so + * each env's traffic stays on one shard rather than broadcasting cluster-wide. */ export class RunChangeNotifier { #publisher: RedisClient | undefined; #subscriber: RedisClient | undefined; - readonly #listeners = new Map<string, Set<() => void>>(); + readonly #listeners = new Map<string, Set<(records: ChangeRecord[]) => void>>(); + /** + * Per-channel accumulator of records since the last delivery, deduped by runId. A + * coalesced env window collapses many publishes into one wake; this holds the batch so + * the wake carries every run that moved, not just the last one (latest record per run + * wins, keeping the freshest keys). + */ + readonly #pending = new Map<string, Map<string, ChangeRecord>>(); readonly #channelPrefix: string; readonly #connectionName: string; readonly #coalesceWindowMs: number; /** When true, use sharded pub/sub (SSUBSCRIBE/SPUBLISH/smessage) — see options. */ readonly #sharded: boolean; - /** Active coalescing windows per channel (env channels only). 
*/ + /** Active coalescing windows per channel. */ readonly #coalesceTimers = new Map<string, ReturnType<typeof setTimeout>>(); /** Channels that received a message while their window was open (need a trailing wake). */ readonly #coalesceDirty = new Set(); @@ -88,23 +135,26 @@ export class RunChangeNotifier { } /** - * Fire-and-forget publish of a run-changed signal. Never throws. Publishes to - * the per-run channel (single-run feed) and, when environmentId is known, the - * per-env channel (tag/list feed). Payload is the runId so env consumers can - * tell which run moved. IDs only, never row data. + * Fire-and-forget publish of a run-changed fact to the run's environment channel. Never + * throws. The notifier stamps the record version. */ - publish(input: RunChangeInput): void { - this.#publishToChannel(this.#channelForRun(input.runId), input.runId); - if (input.environmentId) { - this.#publishToChannel(this.#channelForEnv(input.environmentId), input.runId); + publish(input: ChangeRecordInput): void { + const record: ChangeRecord = { v: CHANGE_RECORD_VERSION, ...input }; + this.#publishToChannel(this.#channelForEnv(record.envId), encodeChangeRecord(record)); + } + + /** Fire-and-forget publish of many run-changed facts. Never throws. */ + publishMany(inputs: ChangeRecordInput[]): void { + for (const input of inputs) { + this.publish(input); } } #publishToChannel(channel: string, payload: string): void { try { const publisher = this.#ensurePublisher(); - // Sharded pub/sub (SPUBLISH) routes to the channel's slot owner; classic - // PUBLISH broadcasts cluster-wide. The channel is hash-tagged by run/env id. + // Sharded pub/sub (SPUBLISH) routes to the channel's slot owner; classic PUBLISH + // broadcasts cluster-wide. The channel is hash-tagged by envId. const result = this.#sharded ? publisher.spublish(channel, payload) : publisher.publish(channel, payload); @@ -124,40 +174,16 @@ export class RunChangeNotifier { } } - /** Fire-and-forget publish of many run-changed signals. Never throws. 
*/ - publishMany(inputs: RunChangeInput[]): void { - for (const input of inputs) { - this.publish(input); - } - } - - /** - * Subscribe to the next change for a single run (single-run feed). - */ - subscribeToRunChanges(runId: string): RunChangeSubscription { - return this.#subscribe(this.#channelForRun(runId)); - } - - /** - * Subscribe to the next change of ANY run in an environment (tag/list feed). - * The consumer re-resolves its filter on each wake. - */ - subscribeToEnvChanges(environmentId: string): RunChangeSubscription { - return this.#subscribe(this.#channelForEnv(environmentId)); - } - /** - * Refcounted subscribe over the shared subscriber, keyed by the full channel: - * the first listener for a channel issues SUBSCRIBE, the last one UNSUBSCRIBE. + * Subscribe (persistently) to an environment's run-change stream. `onBatch` is invoked + * with the coalesced batch of records on every wake until the returned unsubscribe is + * called. Refcounted over the shared subscriber: the first listener for an env issues + * SUBSCRIBE, the last one UNSUBSCRIBE. 
*/ - #subscribe(channel: string): RunChangeSubscription { + subscribeToEnv(environmentId: string, onBatch: (records: ChangeRecord[]) => void): () => void { + const channel = this.#channelForEnv(environmentId); const subscriber = this.#ensureSubscriber(); - let resolveChanged: () => void = () => {}; - const changed = new Promise((resolve) => { - resolveChanged = resolve; - }); - let listeners = this.#listeners.get(channel); if (!listeners) { listeners = new Set(); @@ -169,10 +195,10 @@ export class RunChangeNotifier { }); }); } - listeners.add(resolveChanged); + listeners.add(onBatch); let unsubscribed = false; - const unsubscribe = () => { + return () => { if (unsubscribed) { return; } @@ -182,12 +208,12 @@ export class RunChangeNotifier { if (!current) { return; } - current.delete(resolveChanged); + current.delete(onBatch); if (current.size === 0) { - // Drop the channel from the map only AFTER Redis confirms UNSUBSCRIBE, and - // only if no new listener re-subscribed while it was in flight. The map - // entry's existence mirrors "subscribed (or subscribe in flight) in Redis", - // so the subscribe path safely reuses it without a duplicate SUBSCRIBE. + // Drop the channel from the map only AFTER Redis confirms UNSUBSCRIBE, and only if + // no new listener re-subscribed while it was in flight. The map entry's existence + // mirrors "subscribed (or subscribe in flight) in Redis", so the subscribe path + // safely reuses it without a duplicate SUBSCRIBE. this.#unsubscribeChannel(subscriber, channel) .then(() => { const latest = this.#listeners.get(channel); @@ -197,9 +223,9 @@ export class RunChangeNotifier { if (latest.size === 0) { this.#listeners.delete(channel); } else { - // A listener arrived during the in-flight UNSUBSCRIBE; the channel is - // now unsubscribed in Redis but has live waiters. Re-subscribe so they - // still receive messages (the long-poll backstop covers the gap). 
+ // A listener arrived during the in-flight UNSUBSCRIBE; the channel is now + // unsubscribed in Redis but has live listeners. Re-subscribe so they keep + // receiving messages (the long-poll backstop covers the gap). this.#subscribeChannel(subscriber, channel).catch((error) => { logger.error("[runChangeNotifier] Failed to re-subscribe to run-change channel", { error, @@ -209,9 +235,9 @@ export class RunChangeNotifier { } }) .catch((error) => { - // UNSUBSCRIBE failed: the channel is likely still subscribed in Redis. - // Keep the (empty) map entry so a future subscriber reuses it without a - // duplicate SUBSCRIBE and #onMessage stays consistent with Redis state. + // UNSUBSCRIBE failed: the channel is likely still subscribed in Redis. Keep the + // (empty) map entry so a future subscriber reuses it without a duplicate + // SUBSCRIBE and #onMessage stays consistent with Redis state. logger.error("[runChangeNotifier] Failed to unsubscribe from run-change channel", { error, channel, @@ -219,11 +245,9 @@ export class RunChangeNotifier { }); } }; - - return { changed, unsubscribe }; } - /** Number of distinct channels currently subscribed (for metrics). */ + /** Number of distinct env channels currently subscribed (for metrics). */ get activeSubscriptionCount(): number { return this.#listeners.size; } @@ -234,6 +258,7 @@ export class RunChangeNotifier { } this.#coalesceTimers.clear(); this.#coalesceDirty.clear(); + this.#pending.clear(); await Promise.allSettled([this.#subscriber?.quit(), this.#publisher?.quit()]); this.#subscriber = undefined; this.#publisher = undefined; @@ -250,9 +275,9 @@ export class RunChangeNotifier { #ensureSubscriber(): RedisClient { if (!this.#subscriber) { const subscriber = createRedisClient(`${this.#connectionName}:sub`, this.options.redis); - const onMessage = (channel: string) => this.#onMessage(channel); - // Classic pub/sub delivers "message"; sharded pub/sub delivers "smessage". 
- // Register both so the delivery path is identical regardless of mode. + const onMessage = (channel: string, message: string) => this.#onMessage(channel, message); + // Classic pub/sub delivers "message"; sharded pub/sub delivers "smessage". Register + // both so the delivery path is identical regardless of mode. subscriber.on("message", onMessage); subscriber.on("smessage", onMessage); this.#subscriber = subscriber; @@ -270,34 +295,50 @@ export class RunChangeNotifier { return this.#sharded ? subscriber.sunsubscribe(channel) : subscriber.unsubscribe(channel); } - #onMessage(channel: string) { - // The per-env channel carries a busy environment's entire run-change firehose to - // every tag/batch feed, so throttle it; the per-run channel is low-volume and - // latency-sensitive, so deliver it immediately. - if (this.#coalesceWindowMs > 0 && this.#isEnvChannel(channel)) { + #onMessage(channel: string, message: string) { + // Accumulate the decoded record (deduped by runId) before delivering, so a coalesced + // wake carries every run that moved during the window. + this.#addPending(channel, decodeChangeRecord(message)); + + if (this.#coalesceWindowMs > 0) { this.#deliverCoalesced(channel); return; } this.#deliver(channel); } + /** Accumulate a record into the channel's pending batch, deduped by runId (a later + * record for the same run replaces the earlier one, keeping the freshest keys). */ + #addPending(channel: string, record: ChangeRecord) { + let batch = this.#pending.get(channel); + if (!batch) { + batch = new Map(); + this.#pending.set(channel, batch); + } + batch.set(record.runId, record); + } + #deliver(channel: string) { + // Drain the accumulated batch (and clear it) so listeners woken now get every run that + // changed since the last delivery, and a later message starts a fresh batch. + const batchMap = this.#pending.get(channel); + const batch = batchMap ? 
[...batchMap.values()] : []; + this.#pending.delete(channel); + const listeners = this.#listeners.get(channel); - if (!listeners) { + if (!listeners || batch.length === 0) { return; } - // One-shot: each waiter resolves its race and removes itself via unsubscribe(). - for (const resolve of [...listeners]) { - resolve(); + for (const onBatch of [...listeners]) { + onBatch(batch); } } /** - * Leading-edge throttle: deliver the first wake immediately, then suppress further - * wakes for the window, delivering one trailing wake if any messages arrived during - * it (and re-opening while activity continues). Caps the feed-wake rate per env to - * ~1/window no matter how fast runs change. Lossless: consumers refetch current - * state on a wake, so a coalesced burst is captured by the next refetch. + * Leading-edge throttle: deliver the first wake immediately, then suppress further wakes + * for the window, delivering one trailing wake if any messages arrived during it (and + * re-opening while activity continues). Caps the wake rate per env to ~1/window no + * matter how fast runs change. Lossless: the batch accumulates across the window. */ #deliverCoalesced(channel: string) { if (this.#coalesceTimers.has(channel)) { @@ -321,16 +362,8 @@ export class RunChangeNotifier { this.#coalesceTimers.set(channel, timer); } - #isEnvChannel(channel: string): boolean { - return channel.startsWith(`${this.#channelPrefix}env:`); - } - - // Channels are hash-tagged (`...{}`) so all of a run's/env's traffic maps to - // one cluster slot (one shard) under sharded pub/sub. - #channelForRun(runId: string): string { - return `${this.#channelPrefix}run:{${runId}}`; - } - + // Hash-tagged (`...{}`) so all of an env's traffic maps to one cluster slot (one + // shard) under sharded pub/sub. 
#channelForEnv(environmentId: string): string { return `${this.#channelPrefix}env:{${environmentId}}`; } diff --git a/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts index 9ed93e66a4a..fa5f5681f90 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts @@ -1,78 +1,101 @@ import { env } from "~/env.server"; import { engine } from "~/v3/runEngine.server"; import { logger } from "../logger.server"; -import { publishRunChanged } from "./runChangeNotifierInstance.server"; +import { publishChangeRecord } from "./runChangeNotifierInstance.server"; /** - * Registers the run-changed delegations as additive listeners on the Run Engine - * 2.0 event bus. All logic lives in the notifier - * module; each listener here is a one-line, fire-and-forget delegate. Because - * they only attach to engine events, they cover V2 runs exclusively (V1/MarQS - * never reach this engine), and they're trivially reversible (delete this file + - * its boot registration). + * ChangeRecordBuilder — builds and publishes a self-describing `ChangeRecord` to the run's + * environment channel for the lifecycle events whose engine-bus payload already carries + * env + tags + batchId. One publish per change; `envId` is always present. * - * Coverage is intentionally not exhaustive: a dropped or uncovered transition - * only adds latency because the consumer has a ~5s refetch backstop. We cover the - * high-value, env-cheap transitions here. 
+ * The terminal transitions (runSucceeded/runFailed/runExpired/runCancelled), + * runAttemptFailed, and runMetadataUpdated publish from `runEngineHandlers.server.ts` + * instead — those events don't carry env/tags/batchId on the bus, but that file already + * re-reads the run (or resolves the env) for each, so the publish piggybacks on the + * existing read rather than widening the event bus. So fully disabling publishing is the + * env master switch (`REALTIME_NOTIFIER_ENABLED`), not just deleting this file. + * + * Coverage is intentionally not exhaustive: a dropped or uncovered transition only adds + * latency because the consumer has a periodic backstop full-resolve. */ export function registerRunChangeNotifierHandlers() { - // Return a truthy value in every path so the singleton() wrapper (which uses - // ??=) caches the result and never re-runs this factory — re-running would - // attach duplicate engine-bus listeners on each Remix dev-mode reload. + // Return a truthy value in every path so the singleton() wrapper (which uses ??=) caches + // the result and never re-runs this factory — re-running would attach duplicate + // engine-bus listeners on each Remix dev-mode reload. if (env.REALTIME_NOTIFIER_ENABLED !== "1") { return true; } - // Status transitions (checkpoint suspend/resume, pending version, dequeue) — - // environment.id is in the payload. + // Run created (trigger). The first signal a tag/batch feed gets for a brand-new run: a + // freshly-created run is born QUEUED with no status transition, so without this it only + // surfaces on the consumer's periodic backstop resolve (and not at all before ClickHouse + // ingests it). Routing the create record hydrates the new run by id straight from Postgres. 
+ engine.eventBus.on("runCreated", ({ run, environment }) => { + publishChangeRecord({ + runId: run.id, + envId: environment.id, + tags: run.runTags, + batchId: run.batchId, + }); + }); + + // Status transitions (checkpoint suspend/resume, pending version, dequeue). engine.eventBus.on("runStatusChanged", ({ run, environment }) => { - publishRunChanged({ runId: run.id, environmentId: environment.id }); + publishChangeRecord({ + runId: run.id, + envId: environment.id, + tags: run.runTags, + batchId: run.batchId, + }); }); // Dequeue/lock (sets startedAt) and attempt start (DEQUEUED -> EXECUTING) — the // most-watched "my run started" transitions. engine.eventBus.on("runLocked", ({ run, environment }) => { - publishRunChanged({ runId: run.id, environmentId: environment.id }); + publishChangeRecord({ + runId: run.id, + envId: environment.id, + tags: run.runTags, + batchId: run.batchId, + }); }); engine.eventBus.on("runAttemptStarted", ({ run, environment }) => { - publishRunChanged({ runId: run.id, environmentId: environment.id }); + publishChangeRecord({ + runId: run.id, + envId: environment.id, + tags: run.runTags, + batchId: run.batchId, + }); }); - // Terminal + failure transitions. 
- engine.eventBus.on("runSucceeded", ({ run, environment }) => { - publishRunChanged({ runId: run.id, environmentId: environment.id }); - }); - engine.eventBus.on("runFailed", ({ run, environment }) => { - publishRunChanged({ runId: run.id, environmentId: environment.id }); - }); - engine.eventBus.on("runExpired", ({ run, environment }) => { - publishRunChanged({ runId: run.id, environmentId: environment.id }); - }); - engine.eventBus.on("runCancelled", ({ run, environment }) => { - publishRunChanged({ runId: run.id, environmentId: environment.id }); - }); engine.eventBus.on("runRetryScheduled", ({ run, environment }) => { - publishRunChanged({ runId: run.id, environmentId: environment.id }); + publishChangeRecord({ + runId: run.id, + envId: environment.id, + tags: run.runTags, + batchId: run.batchId, + }); }); // Delay lifecycle (delayUntil / queued-after-delay changes). engine.eventBus.on("runDelayRescheduled", ({ run, environment }) => { - publishRunChanged({ runId: run.id, environmentId: environment.id }); + publishChangeRecord({ + runId: run.id, + envId: environment.id, + tags: run.runTags, + batchId: run.batchId, + }); }); engine.eventBus.on("runEnqueuedAfterDelay", ({ run, environment }) => { - publishRunChanged({ runId: run.id, environmentId: environment.id }); - }); - - // Attempt failures and metadata updates don't carry environmentId, but the - // single-run channel is keyed by runId alone. 
- engine.eventBus.on("runAttemptFailed", ({ run }) => { - publishRunChanged({ runId: run.id }); - }); - engine.eventBus.on("runMetadataUpdated", ({ run }) => { - publishRunChanged({ runId: run.id }); + publishChangeRecord({ + runId: run.id, + envId: environment.id, + tags: run.runTags, + batchId: run.batchId, + }); }); - logger.info("[runChangeNotifier] realtime run-change notifier handlers registered"); + logger.info("[runChangeNotifier] realtime change-record builder registered"); return true; } diff --git a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts index c49f7706042..ed1d1ce12b2 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts @@ -2,11 +2,7 @@ import { Gauge } from "prom-client"; import { env } from "~/env.server"; import { metricsRegister } from "~/metrics.server"; import { singleton } from "~/utils/singleton"; -import { - RunChangeNotifier, - type RunChangeInput, - type RunChangeSubscription, -} from "./runChangeNotifier.server"; +import { RunChangeNotifier, type ChangeRecordInput } from "./runChangeNotifier.server"; /** * Process-singleton wiring for the RunChangeNotifier plus the thin, gated @@ -61,25 +57,18 @@ export function isRunChangeNotifierEnabled(): boolean { return notifierEnabled; } -/** Fire-and-forget run-changed notify. No-op (and no notifier construction) when disabled. */ -export function publishRunChanged(input: RunChangeInput): void { +/** Fire-and-forget publish of a run-changed record. No-op (and no notifier construction) + * when disabled, so publish sites can call it unconditionally. 
*/ +export function publishChangeRecord(input: ChangeRecordInput): void { if (!notifierEnabled) { return; } getRunChangeNotifier().publish(input); } -export function publishManyRunChanged(inputs: RunChangeInput[]): void { +export function publishManyChangeRecords(inputs: ChangeRecordInput[]): void { if (!notifierEnabled) { return; } getRunChangeNotifier().publishMany(inputs); } - -/** Subscribe to the next change for a run via the shared subscriber. */ -export function subscribeToRunChanges(runId: string): RunChangeSubscription { - if (!notifierEnabled) { - throw new Error("Run change notifier is disabled"); - } - return getRunChangeNotifier().subscribeToRunChanges(runId); -} diff --git a/apps/webapp/app/v3/runEngineHandlers.server.ts b/apps/webapp/app/v3/runEngineHandlers.server.ts index 3277d74ba6e..edd25ca8cde 100644 --- a/apps/webapp/app/v3/runEngineHandlers.server.ts +++ b/apps/webapp/app/v3/runEngineHandlers.server.ts @@ -20,11 +20,12 @@ import { createExceptionPropertiesFromError } from "./eventRepository/common.ser import { getEventRepositoryForStore, recordRunDebugLog } from "./eventRepository/index.server"; import { roomFromFriendlyRunId, socketIo } from "./handleSocketIo.server"; import { engine } from "./runEngine.server"; +import { publishChangeRecord } from "~/services/realtime/runChangeNotifierInstance.server"; import { PerformTaskRunAlertsService } from "./services/alerts/performTaskRunAlerts.server"; import { TaskRunErrorCodes } from "@trigger.dev/core/v3"; export function registerRunEngineEventBusHandlers() { - engine.eventBus.on("runSucceeded", async ({ time, run, organization }) => { + engine.eventBus.on("runSucceeded", async ({ time, run, organization, environment }) => { const [taskRunError, taskRun] = await tryCatch( $replica.taskRun.findFirstOrThrow({ where: { @@ -45,6 +46,11 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on 
this existing read so the + // per-env channel carries the membership keys (no separate query). No-op when + // the notifier is disabled. + runTags: true, + batchId: true, }, }) ); @@ -57,6 +63,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: environment.id, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + const eventRepository = await getEventRepositoryForStore( run.taskEventStore, taskRun.organizationId ?? organization.id @@ -91,7 +104,7 @@ export function registerRunEngineEventBusHandlers() { }); // Handle events - engine.eventBus.on("runFailed", async ({ time, run, organization }) => { + engine.eventBus.on("runFailed", async ({ time, run, organization, environment }) => { const sanitizedError = sanitizeError(run.error); const exception = createExceptionPropertiesFromError(sanitizedError); @@ -115,6 +128,10 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the notifier is disabled). + runTags: true, + batchId: true, }, }) ); @@ -127,6 +144,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: environment.id, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + const eventRepository = await getEventRepositoryForStore( run.taskEventStore, taskRun.organizationId ?? organization.id @@ -172,6 +196,10 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the notifier is disabled). 
+ runTags: true, + batchId: true, }, }) ); @@ -184,6 +212,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: taskRun.runtimeEnvironmentId, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + if (!taskRun.organizationId) { logger.error("[runAttemptFailed] Task run has no organization id", { runId: run.id, @@ -328,7 +363,7 @@ export function registerRunEngineEventBusHandlers() { } ); - engine.eventBus.on("runExpired", async ({ time, run, organization }) => { + engine.eventBus.on("runExpired", async ({ time, run, organization, environment }) => { if (!run.ttl) { return; } @@ -353,6 +388,10 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the notifier is disabled). + runTags: true, + batchId: true, }, }) ); @@ -365,6 +404,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: environment.id, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + const eventRepository = await getEventRepositoryForStore( taskRun.taskEventStore, taskRun.organizationId ?? organization.id @@ -386,7 +432,7 @@ export function registerRunEngineEventBusHandlers() { } }); - engine.eventBus.on("runCancelled", async ({ time, run, organization }) => { + engine.eventBus.on("runCancelled", async ({ time, run, organization, environment }) => { const [taskRunError, taskRun] = await tryCatch( $replica.taskRun.findFirstOrThrow({ where: { @@ -407,6 +453,10 @@ export function registerRunEngineEventBusHandlers() { isTest: true, organizationId: true, taskEventStore: true, + // Piggyback the realtime run-changed publish on this existing read (no-op when + // the notifier is disabled). 
+ runTags: true, + batchId: true, }, }) ); @@ -419,6 +469,13 @@ export function registerRunEngineEventBusHandlers() { return; } + publishChangeRecord({ + runId: taskRun.id, + envId: environment.id, + tags: taskRun.runTags, + batchId: taskRun.batchId, + }); + const eventRepository = await getEventRepositoryForStore( taskRun.taskEventStore, taskRun.organizationId ?? organization.id @@ -505,15 +562,21 @@ export function registerRunEngineEventBusHandlers() { }); engine.eventBus.on("runMetadataUpdated", async ({ time, run }) => { - const env = await findEnvironmentFromRun(run.id); + const result = await findEnvironmentFromRun(run.id); - if (!env) { + if (!result) { logger.error("[runMetadataUpdated] Failed to find environment", { runId: run.id }); return; } + const { environment, runTags, batchId } = result; + + // Realtime run-changed publish: a full record (env + tags + batchId all from the one + // read above), so tag/batch feeds route by index instead of hydrate-to-classify. + publishChangeRecord({ runId: run.id, envId: environment.id, tags: runTags, batchId }); + try { - await updateMetadataService.call(run.id, run.metadata, env); + await updateMetadataService.call(run.id, run.metadata, environment); } catch (e) { if (e instanceof MetadataTooLargeError) { logger.warn("[runMetadataUpdated] Failed to update metadata, too large", { diff --git a/apps/webapp/test/realtime/envChangeRouter.test.ts b/apps/webapp/test/realtime/envChangeRouter.test.ts new file mode 100644 index 00000000000..befe0356284 --- /dev/null +++ b/apps/webapp/test/realtime/envChangeRouter.test.ts @@ -0,0 +1,187 @@ +import { describe, expect, it, vi } from "vitest"; +import { + EnvChangeRouter, + type EnvChangeSource, + type RowHydrator, +} from "~/services/realtime/envChangeRouter.server"; +import { type ChangeRecord } from "~/services/realtime/runChangeNotifier.server"; +import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; + +const FLOOR_MS = Date.UTC(2026, 5, 7, 
12, 0, 0); + +function row( + id: string, + opts: { tags?: string[]; createdAtMs?: number; updatedAtMs?: number } = {} +): RealtimeRunRow { + return { + id, + runTags: opts.tags ?? [], + createdAt: new Date(opts.createdAtMs ?? FLOOR_MS + 1_000), + updatedAt: new Date(opts.updatedAtMs ?? FLOOR_MS + 5_000), + } as unknown as RealtimeRunRow; +} + +function record(runId: string, extra: Partial = {}): ChangeRecord { + return { v: 1, runId, envId: "env_1", ...extra }; +} + +/** A controllable EnvChangeSource: tests push batches to the env's listener. */ +function fakeSource() { + const listeners = new Map void>>(); + const source: EnvChangeSource = { + subscribeToEnv(envId, onBatch) { + let set = listeners.get(envId); + if (!set) { + set = new Set(); + listeners.set(envId, set); + } + set.add(onBatch); + return () => { + listeners.get(envId)?.delete(onBatch); + }; + }, + }; + return { + source, + push(envId: string, records: ChangeRecord[]) { + for (const l of listeners.get(envId) ?? []) l(records); + }, + isSubscribed(envId: string) { + return (listeners.get(envId)?.size ?? 0) > 0; + }, + }; +} + +function makeRouter(rowsById: Map = new Map()) { + const src = fakeSource(); + const hydrateSpy = vi.fn(async (_env, ids) => + ids.map((id) => rowsById.get(id)).filter((r): r is RealtimeRunRow => Boolean(r)) + ); + const router = new EnvChangeRouter({ source: src.source, hydrator: { hydrateByIds: hydrateSpy } }); + return { router, src, hydrateSpy }; +} + +describe("EnvChangeRouter", () => { + it("routes a tag match to the feed (hydrated + serialized) and ignores non-matches", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows); + const reg = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + const wait = reg.waitForMatch(undefined, 1_000); + + // A non-matching tag is dropped (no wake); a matching tag wakes with the hydrated row. 
+ src.push("env_1", [record("rX", { tags: ["b"] }), record("r1", { tags: ["a"] })]); + + const result = await wait; + expect(result.reason).toBe("notify"); + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + expect(result.rows[0].value.id).toBe("r1"); // serialized wire value + expect(hydrateSpy).toHaveBeenCalledWith("env_1", ["r1"], []); + reg.close(); + }); + + it("batch-hydrates ONCE and shares the serialized value across feeds matching the same run", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows); + const regs = [ + router.register("env_1", { kind: "tag", tags: ["a"] }, []), + router.register("env_1", { kind: "tag", tags: ["a"] }, []), + ]; + const waits = regs.map((r) => r.waitForMatch(undefined, 1_000)); + + src.push("env_1", [record("r1", { tags: ["a"] })]); + const results = await Promise.all(waits); + + // One hydrate for the whole tick (same column set), shared by both feeds... + expect(hydrateSpy).toHaveBeenCalledTimes(1); + // ...and the same serialized value object is reused (serialize-once). 
+ expect(results[0].rows[0].value).toBe(results[1].rows[0].value); + regs.forEach((r) => r.close()); + }); + + it("routes a run feed by exact runId", async () => { + const rows = new Map([["r1", row("r1")]]); + const { router, src } = makeRouter(rows); + const reg = router.register("env_1", { kind: "run", runId: "r1" }, []); + const wait = reg.waitForMatch(undefined, 1_000); + src.push("env_1", [record("r2"), record("r1")]); + const result = await wait; + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + reg.close(); + }); + + it("routes a batch feed by batchId", async () => { + const rows = new Map([["r1", row("r1")]]); + const { router, src } = makeRouter(rows); + const reg = router.register("env_1", { kind: "batch", batchId: "batch_1" }, []); + const wait = reg.waitForMatch(undefined, 1_000); + src.push("env_1", [ + record("rX", { batchId: "other" }), + record("r1", { batchId: "batch_1" }), + ]); + const result = await wait; + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + reg.close(); + }); + + it("drops a tag match created before the feed's createdAt floor", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"], createdAtMs: FLOOR_MS - 10_000 })]]); + const { router, src } = makeRouter(rows); + const reg = router.register("env_1", { kind: "tag", tags: ["a"], createdAtFloorMs: FLOOR_MS }, []); + let settled = false; + const wait = reg.waitForMatch(undefined, 60).then((r) => { + settled = true; + return r; + }); + src.push("env_1", [record("r1", { tags: ["a"], createdAtMs: FLOOR_MS - 10_000 })]); + // Hydrated but out-of-window -> not woken; falls through to the timeout. + const result = await wait; + expect(settled).toBe(true); + expect(result.reason).toBe("timeout"); + reg.close(); + }); + + it("classifies a partial record (no tags) by hydrating and re-checking the row's tags", async () => { + // Partial record routes to all tag feeds as candidates; the authoritative row decides. 
+ const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src } = makeRouter(rows); + const match = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + const noMatch = router.register("env_1", { kind: "tag", tags: ["z"] }, []); + const matchWait = match.waitForMatch(undefined, 1_000); + let noMatchSettled = false; + const noMatchWait = noMatch.waitForMatch(undefined, 80).then((r) => { + noMatchSettled = true; + return r; + }); + + src.push("env_1", [record("r1", { tags: undefined })]); // partial: tags absent + + expect((await matchWait).rows.map((m) => m.row.id)).toEqual(["r1"]); + expect((await noMatchWait).reason).toBe("timeout"); // row tags ["a"] don't intersect ["z"] + expect(noMatchSettled).toBe(true); + match.close(); + noMatch.close(); + }); + + it("times out and aborts cleanly", async () => { + const { router, src } = makeRouter(); + const reg = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + expect((await reg.waitForMatch(undefined, 30)).reason).toBe("timeout"); + + const controller = new AbortController(); + const wait = reg.waitForMatch(controller.signal, 5_000); + controller.abort(); + expect((await wait).reason).toBe("abort"); + reg.close(); + expect(src.isSubscribed("env_1")).toBe(false); // last feed left -> unsubscribed + }); + + it("only routes to feeds currently waiting (gaps between polls fall to the backstop)", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows); + const reg = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + // Not waiting yet: a push is dropped (no hydrate, no buffering). 
+ src.push("env_1", [record("r1", { tags: ["a"] })]); + expect(hydrateSpy).not.toHaveBeenCalled(); + reg.close(); + }); +}); diff --git a/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts b/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts index d429cb3f8de..c9976e96678 100644 --- a/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts +++ b/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts @@ -1,66 +1,86 @@ +import { setTimeout as sleep } from "node:timers/promises"; import { CURRENT_API_VERSION } from "~/api/versions"; import { NotifierRealtimeClient, type RealtimeListEnvironment, } from "~/services/realtime/notifierRealtimeClient.server"; import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; +import { + EnvChangeRouter, + type EnvChangeSource, +} from "~/services/realtime/envChangeRouter.server"; +import { type ChangeRecord } from "~/services/realtime/runChangeNotifier.server"; import { describe, expect, it, vi } from "vitest"; const ENV: RealtimeListEnvironment = { id: "env_1", organizationId: "org_1", projectId: "proj_1" }; -// Fixed offset floor so a row's updatedAt being above/below it deterministically -// produces a delta / empty diff. +// Fixed offset floor: a row's updatedAt above/below it produces a delta / empty diff. The +// createdAt window resolves to this same floor (large maximumCreatedAtFilterAgeMs below). const FLOOR_MS = Date.UTC(2026, 5, 7, 12, 0, 0); -function row(id: string, updatedAtMs: number): RealtimeRunRow { +function row( + id: string, + updatedAtMs: number, + opts: { createdAtMs?: number; tags?: string[] } = {} +): RealtimeRunRow { return { id, - createdAt: new Date("2026-06-07T09:00:00.000Z"), + runTags: opts.tags ?? ["t"], + createdAt: new Date(opts.createdAtMs ?? FLOOR_MS + 1_000), updatedAt: new Date(updatedAtMs), } as unknown as RealtimeRunRow; } -/** A notifier whose env wakes are driven manually via wake(). 
Each live-poll loop - * iteration subscribes once (one-shot), so wake() releases exactly one iteration. */ -function controllableNotifier() { - const pending: Array<() => void> = []; - return { - subscribeToRunChanges: () => ({ changed: new Promise(() => {}), unsubscribe() {} }), - subscribeToEnvChanges: () => { - let resolve!: () => void; - const changed = new Promise((r) => { - resolve = r; - }); - pending.push(resolve); - return { changed, unsubscribe() {} }; - }, - wake() { - pending.shift()?.(); +function rec(runId: string, extra: Partial = {}): ChangeRecord { + return { v: 1, runId, envId: "env_1", ...extra }; +} + +/** A controllable EnvChangeSource the test pushes batches into. */ +function fakeSource() { + const listeners = new Map void>>(); + const source: EnvChangeSource = { + subscribeToEnv(envId, onBatch) { + let set = listeners.get(envId); + if (!set) { + set = new Set(); + listeners.set(envId, set); + } + set.add(onBatch); + return () => listeners.get(envId)?.delete(onBatch); }, - pending() { - return pending.length; + }; + return { + source, + push: (envId: string, records: ChangeRecord[]) => { + for (const l of listeners.get(envId) ?? []) l(records); }, + isSubscribed: (envId: string) => (listeners.get(envId)?.size ?? 
0) > 0, }; } -function makeClient(notifier: unknown, overrides: Record = {}) { +function makeClient(overrides: Record = {}) { let rowsToReturn: RealtimeRunRow[] = []; - const hydrateSpy = vi.fn(async () => rowsToReturn); + const hydrateSpy = vi.fn(async (_env: string, ids: string[]) => + rowsToReturn.filter((r) => ids.includes(r.id)) + ); + const resolveSpy = vi.fn(async () => rowsToReturn.map((r) => r.id)); + const src = fakeSource(); + const router = new EnvChangeRouter({ source: src.source, hydrator: { hydrateByIds: hydrateSpy } }); const client = new NotifierRealtimeClient({ runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, - runListResolver: { resolveMatchingRunIds: async () => ["run_1"] } as any, - notifier: notifier as any, + runListResolver: { resolveMatchingRunIds: resolveSpy } as any, + router, limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, cachedLimitProvider: { getCachedLimit: async () => 100 }, - maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, - // Disable the resolve cache so each held iteration re-hydrates the latest rows. + // Large so the recovered createdAt floor isn't clamped past FLOOR_MS. + maximumCreatedAtFilterAgeMs: 100 * 365 * 24 * 60 * 60 * 1000, runSetResolveCacheTtlMs: 0, livePollTimeoutMs: 10_000, ...overrides, }); - return { client, hydrateSpy, setRows: (rows: RealtimeRunRow[]) => (rowsToReturn = rows) }; + return { client, src, hydrateSpy, resolveSpy, setRows: (rows: RealtimeRunRow[]) => (rowsToReturn = rows) }; } function liveRuns(client: NotifierRealtimeClient) { @@ -74,63 +94,98 @@ function liveRuns(client: NotifierRealtimeClient) { ); } +async function whenWaiting(src: ReturnType) { + // Subscribed (feed registered) + a tick so waitForMatch has armed feed.resolve. 
+ await vi.waitFor(() => expect(src.isSubscribed("env_1")).toBe(true)); + await sleep(15); +} + async function bodyOf(res: Response) { - return JSON.parse(await res.text()) as Array<{ headers?: { control?: string; operation?: string }; value?: unknown }>; + return JSON.parse(await res.text()) as Array<{ + headers?: { control?: string; operation?: string }; + value?: unknown; + }>; } const hasRowOp = (body: Awaited>) => body.some((m) => m?.headers?.operation || (m && typeof m === "object" && "value" in m)); const isUpToDate = (body: Awaited>) => body.some((m) => m?.headers?.control === "up-to-date"); -describe("NotifierRealtimeClient lever A (hold-on-empty)", () => { - it("holds the long-poll on an empty diff and only responds when a real delta arrives", async () => { - const notifier = controllableNotifier(); - const { client, hydrateSpy, setRows } = makeClient(notifier); - setRows([row("run_1", FLOOR_MS - 1_000)]); // older than the floor -> empty diff +describe("NotifierRealtimeClient multi-run live path over the router", () => { + it("a matching change hydrates by id (no ClickHouse) and returns a delta", async () => { + const { client, src, hydrateSpy, resolveSpy, setRows } = makeClient(); + setRows([row("run_1", FLOOR_MS + 5_000, { tags: ["t"] })]); + + const responsePromise = liveRuns(client); + await whenWaiting(src); + src.push("env_1", [rec("run_1", { tags: ["t", "x"] })]); + + const res = await responsePromise; + expect(res.status).toBe(200); + expect(hasRowOp(await bodyOf(res))).toBe(true); + expect(resolveSpy).not.toHaveBeenCalled(); // ClickHouse skipped + expect(hydrateSpy).toHaveBeenCalledWith("env_1", ["run_1"], expect.anything()); + }); + + it("a change that doesn't match the filter never wakes the feed (no CH, no PG); a later match does", async () => { + const { client, src, hydrateSpy, resolveSpy, setRows } = makeClient(); + setRows([row("run_1", FLOOR_MS + 5_000, { tags: ["t"] })]); const responsePromise = liveRuns(client); let settled = false; 
void responsePromise.then(() => (settled = true)); + await whenWaiting(src); - // Feed subscribed and is waiting. - await vi.waitFor(() => expect(notifier.pending()).toBe(1)); - - // An irrelevant change wakes the env channel, but this feed's diff is empty. - notifier.wake(); - // It must re-subscribe and keep holding (no response yet), having refetched once. - await vi.waitFor(() => expect(notifier.pending()).toBe(1)); + src.push("env_1", [rec("run_x", { tags: ["other"] })]); // doesn't intersect ["t"] + await sleep(50); expect(settled).toBe(false); - expect(hydrateSpy).toHaveBeenCalledTimes(1); - - // A relevant change: a row advances past the floor. - setRows([row("run_1", FLOOR_MS + 5_000)]); - notifier.wake(); + expect(hydrateSpy).not.toHaveBeenCalled(); // router never routed it + expect(resolveSpy).not.toHaveBeenCalled(); + src.push("env_1", [rec("run_1", { tags: ["t"] })]); const res = await responsePromise; expect(settled).toBe(true); - expect(res.status).toBe(200); expect(hasRowOp(await bodyOf(res))).toBe(true); }); - it("returns up-to-date once the backstop elapses with no relevant change", async () => { - const notifier = controllableNotifier(); - const { client } = makeClient(notifier, { livePollTimeoutMs: 50 }); - // No rows ever match; never wake -> the backstop fires and we return up-to-date. 
- const res = await liveRuns(client); + it("a matching run created before the window floor is hydrated but dropped (keeps holding)", async () => { + const { client, src, hydrateSpy, resolveSpy, setRows } = makeClient({ livePollTimeoutMs: 120 }); + setRows([row("run_1", FLOOR_MS + 5_000, { createdAtMs: FLOOR_MS - 10_000, tags: ["t"] })]); + + const responsePromise = liveRuns(client); + let settled = false; + void responsePromise.then(() => (settled = true)); + await whenWaiting(src); + src.push("env_1", [rec("run_1", { tags: ["t"] })]); + + await sleep(40); + expect(settled).toBe(false); // dropped by the createdAt floor -> held + expect(hydrateSpy).toHaveBeenCalledWith("env_1", ["run_1"], expect.anything()); + expect(resolveSpy).not.toHaveBeenCalled(); + + await responsePromise; // drain via the backstop + }); + + it("the backstop timeout does a full ClickHouse resolve and returns up-to-date", async () => { + const { client, resolveSpy } = makeClient({ livePollTimeoutMs: 50 }); + const res = await liveRuns(client); // never pushed -> backstop fires expect(res.status).toBe(200); expect(isUpToDate(await bodyOf(res))).toBe(true); + expect(resolveSpy).toHaveBeenCalled(); }); - it("with holdOnEmpty=false, returns up-to-date on the first empty wake (legacy behavior)", async () => { - const notifier = controllableNotifier(); - const { client } = makeClient(notifier, { holdOnEmpty: false }); + it("with holdOnEmpty=false, a matched-but-not-advanced change returns up-to-date without ClickHouse", async () => { + const { client, src, resolveSpy, setRows } = makeClient({ holdOnEmpty: false }); + // Matches the tag and is in-window, but updatedAt is at/below the offset floor -> no delta. 
+ setRows([row("run_1", FLOOR_MS - 1_000, { tags: ["t"] })]); const responsePromise = liveRuns(client); - await vi.waitFor(() => expect(notifier.pending()).toBe(1)); - notifier.wake(); // empty diff -> legacy path returns immediately + await whenWaiting(src); + src.push("env_1", [rec("run_1", { tags: ["t"] })]); const res = await responsePromise; expect(res.status).toBe(200); expect(isUpToDate(await bodyOf(res))).toBe(true); + expect(resolveSpy).not.toHaveBeenCalled(); }); }); diff --git a/apps/webapp/test/realtime/notifierRealtimeClient.test.ts b/apps/webapp/test/realtime/notifierRealtimeClient.test.ts index fb3349e0e62..5f7b96fc099 100644 --- a/apps/webapp/test/realtime/notifierRealtimeClient.test.ts +++ b/apps/webapp/test/realtime/notifierRealtimeClient.test.ts @@ -4,6 +4,7 @@ import { type RealtimeListEnvironment, } from "~/services/realtime/notifierRealtimeClient.server"; import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; +import { EnvChangeRouter } from "~/services/realtime/envChangeRouter.server"; import { describe, expect, it } from "vitest"; function sampleRow(): RealtimeRunRow { @@ -40,17 +41,17 @@ function sampleRow(): RealtimeRunRow { // Only the initial-snapshot path is exercised here, which touches the shared // #buildResponse — enough to lock the response-header contract. function makeClient(row: RealtimeRunRow | null) { - const never = { changed: new Promise(() => {}), unsubscribe() {} }; return new NotifierRealtimeClient({ runReader: { getRunById: async () => row, hydrateByIds: async () => (row ? [row] : []), } as any, runListResolver: { resolveMatchingRunIds: async () => [] } as any, - notifier: { - subscribeToRunChanges: () => never, - subscribeToEnvChanges: () => never, - } as any, + // Snapshot path only; the router (over a no-op source) is never invoked here. + router: new EnvChangeRouter({ + source: { subscribeToEnv: () => () => {} }, + hydrator: { hydrateByIds: async () => (row ? 
[row] : []) }, + }), limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, cachedLimitProvider: { getCachedLimit: async () => 100 }, maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, diff --git a/apps/webapp/test/realtime/notifierRunSetCache.test.ts b/apps/webapp/test/realtime/notifierRunSetCache.test.ts index 2f325296f1c..6d4fd61e1ab 100644 --- a/apps/webapp/test/realtime/notifierRunSetCache.test.ts +++ b/apps/webapp/test/realtime/notifierRunSetCache.test.ts @@ -4,6 +4,8 @@ import { type RealtimeListEnvironment, } from "~/services/realtime/notifierRealtimeClient.server"; import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; +import { EnvChangeRouter } from "~/services/realtime/envChangeRouter.server"; +import { setTimeout as sleep } from "node:timers/promises"; import { describe, expect, it, vi } from "vitest"; const ENV: RealtimeListEnvironment = { id: "env_1", organizationId: "org_1", projectId: "proj_1" }; @@ -20,12 +22,16 @@ function row(id: string): RealtimeRunRow { function makeClient(overrides: Record = {}) { const resolveSpy = vi.fn(async () => ["run_1", "run_2"]); const hydrateSpy = vi.fn(async (_env: string, ids: string[]) => ids.map(row)); - const never = { changed: new Promise(() => {}), unsubscribe() {} }; const client = new NotifierRealtimeClient({ runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, runListResolver: { resolveMatchingRunIds: resolveSpy } as any, - notifier: { subscribeToRunChanges: () => never, subscribeToEnvChanges: () => never } as any, + // No-op source: live polls never get a router wake, so they fall through to the + // backstop full-resolve — which is what the live tests below assert on. 
+ router: new EnvChangeRouter({ + source: { subscribeToEnv: () => () => {} }, + hydrator: { hydrateByIds: hydrateSpy }, + }), limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, cachedLimitProvider: { getCachedLimit: async () => 100 }, maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, @@ -123,6 +129,84 @@ describe("NotifierRealtimeClient run-set resolve coalescing + cache", () => { }); }); +describe("NotifierRealtimeClient resolve admission gate (mass-reconnect stampede)", () => { + // A resolver that blocks each invocation until released, so we can watch how many run + // concurrently. Tracks peak concurrency and exposes a release-one-at-a-time drain. + function gatedResolver() { + let active = 0; + let peak = 0; + const releases: Array<() => void> = []; + const resolve = vi.fn(async () => { + active++; + peak = Math.max(peak, active); + await new Promise((r) => releases.push(r)); + active--; + return ["run_1"]; + }); + return { + resolve, + peak: () => peak, + releaseOne: () => releases.shift()?.(), + waiting: () => releases.length, + }; + } + + function makeGatedClient(resolveAdmissionLimit: number, resolver: ReturnType) { + const hydrateSpy = vi.fn(async (_env: string, ids: string[]) => ids.map(row)); + return new NotifierRealtimeClient({ + runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, + runListResolver: { resolveMatchingRunIds: resolver.resolve } as any, + router: new EnvChangeRouter({ + source: { subscribeToEnv: () => () => {} }, + hydrator: { hydrateByIds: hydrateSpy }, + }), + limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + maximumCreatedAtFilterAgeMs: 24 * 60 * 60 * 1000, + runSetResolveCacheTtlMs: 0, // no cache -> every distinct filter is a fresh resolve + resolveAdmissionLimit, + }); + } + + it("throttles a distinct-filter stampede to the admission limit of concurrent CH resolves", async () => 
{ + const resolver = gatedResolver(); + const client = makeGatedClient(2, resolver); + + // 5 distinct batchIds => 5 distinct filters => 5 fresh resolves, fired at once. + const polls = [0, 1, 2, 3, 4].map((i) => snapshot(client, `batch_${i}`)); + + // Only the limit (2) may run concurrently; the rest queue for a permit. + await vi.waitFor(() => expect(resolver.resolve).toHaveBeenCalledTimes(2)); + await sleep(20); + expect(resolver.resolve).toHaveBeenCalledTimes(2); // 3 still queued behind the gate + expect(resolver.peak()).toBe(2); + + // Drain: each release frees a permit, admitting exactly one queued resolve. + while (resolver.waiting() > 0) { + resolver.releaseOne(); + await sleep(5); + } + await Promise.all(polls); + + expect(resolver.resolve).toHaveBeenCalledTimes(5); // all ran... + expect(resolver.peak()).toBe(2); // ...but never more than the limit at once + }); + + it("lets a same-filter burst through on a single permit (coalesces before the gate)", async () => { + const resolver = gatedResolver(); + const client = makeGatedClient(1, resolver); // limit 1 would deadlock if each took a permit + + // 5 identical filters fired at once -> single-flight collapses to one in-flight resolve. + const polls = [0, 1, 2, 3, 4].map(() => snapshot(client, "batch_same")); + await vi.waitFor(() => expect(resolver.resolve).toHaveBeenCalledTimes(1)); + await sleep(20); + + resolver.releaseOne(); + await Promise.all(polls); + expect(resolver.resolve).toHaveBeenCalledTimes(1); // one resolve, one permit, no queue + }); +}); + describe("NotifierRealtimeClient tag-list createdAt bucketing", () => { it("floors the resolved createdAt lower bound to the bucket boundary", async () => { // Fix the clock to a non-bucket-aligned instant so the assertion is deterministic. 
@@ -180,13 +264,13 @@ describe("NotifierRealtimeClient tag-list createdAt bucketing", () => { }); describe("NotifierRealtimeClient review fixes", () => { - const ready = { changed: Promise.resolve(), unsubscribe() {} }; - const liveNotifier = { subscribeToRunChanges: () => ready, subscribeToEnvChanges: () => ready }; + // makeClient's router has a no-op source, so the live poll never gets a wake and falls + // through to its backstop timeout — the full ClickHouse resolve these tests assert on + // (createdAt clamp / concurrency limit). it("clamps a stale/crafted handle's createdAt up to the max-age floor", async () => { const maxAge = 24 * 60 * 60 * 1000; const { client, resolveSpy } = makeClient({ - notifier: liveNotifier, maximumCreatedAtFilterAgeMs: maxAge, runSetCreatedAtBucketMs: 0, livePollTimeoutMs: 50, @@ -209,7 +293,6 @@ describe("NotifierRealtimeClient review fixes", () => { it("enforces a concurrency limit of 0 instead of failing with a 500", async () => { let limitCheckedWith: number | undefined; const { client } = makeClient({ - notifier: liveNotifier, cachedLimitProvider: { getCachedLimit: async () => 0 }, limiter: { incrementAndCheck: async (_env: string, _id: string, limit: number) => { diff --git a/apps/webapp/test/realtime/runChangeNotifier.test.ts b/apps/webapp/test/realtime/runChangeNotifier.test.ts index b6c43e05544..96d7fd56a45 100644 --- a/apps/webapp/test/realtime/runChangeNotifier.test.ts +++ b/apps/webapp/test/realtime/runChangeNotifier.test.ts @@ -1,7 +1,12 @@ import { redisTest } from "@internal/testcontainers"; import { setTimeout as sleep } from "node:timers/promises"; -import { describe, expect, vi } from "vitest"; -import { RunChangeNotifier } from "~/services/realtime/runChangeNotifier.server"; +import { describe, expect, it, vi } from "vitest"; +import { + type ChangeRecord, + decodeChangeRecord, + encodeChangeRecord, + RunChangeNotifier, +} from "~/services/realtime/runChangeNotifier.server"; function toRedisOptions(redisOptions: 
{ host?: string; port?: number; password?: string }) { return { @@ -18,92 +23,28 @@ const SUBSCRIBE_SETTLE_MS = 250; describe("RunChangeNotifier", () => { redisTest( - "delivers a published change to a subscriber", + "delivers a published change to an env subscriber", { timeout: 30_000 }, async ({ redisOptions }) => { const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); try { - const subscription = notifier.subscribeToRunChanges("run_1"); + const received: ChangeRecord[] = []; + const unsubscribe = notifier.subscribeToEnv("env_1", (records) => received.push(...records)); expect(notifier.activeSubscriptionCount).toBe(1); - let resolved = false; - void subscription.changed.then(() => { - resolved = true; - }); - - await sleep(SUBSCRIBE_SETTLE_MS); - notifier.publish({ runId: "run_1" }); - - await vi.waitFor(() => expect(resolved).toBe(true), { timeout: 5_000, interval: 50 }); - - subscription.unsubscribe(); - // Cleanup is deferred until Redis confirms UNSUBSCRIBE (avoids a - // subscribe/unsubscribe race), so the count converges to 0 asynchronously. 
- await vi.waitFor(() => expect(notifier.activeSubscriptionCount).toBe(0), { - timeout: 5_000, - interval: 50, - }); - } finally { - await notifier.quit(); - } - } - ); - - redisTest( - "does not wake a subscriber for a different run", - { timeout: 30_000 }, - async ({ redisOptions }) => { - const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); - try { - const subscription = notifier.subscribeToRunChanges("run_a"); - let resolved = false; - void subscription.changed.then(() => { - resolved = true; - }); - - await sleep(SUBSCRIBE_SETTLE_MS); - notifier.publish({ runId: "run_b" }); - await sleep(500); - - expect(resolved).toBe(false); - subscription.unsubscribe(); - } finally { - await notifier.quit(); - } - } - ); - - redisTest( - "refcounts subscriptions per run and wakes all waiters", - { timeout: 30_000 }, - async ({ redisOptions }) => { - const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); - try { - const first = notifier.subscribeToRunChanges("run_x"); - const second = notifier.subscribeToRunChanges("run_x"); - - // Two waiters, one distinct channel. - expect(notifier.activeSubscriptionCount).toBe(1); - - let firstResolved = false; - let secondResolved = false; - void first.changed.then(() => (firstResolved = true)); - void second.changed.then(() => (secondResolved = true)); - await sleep(SUBSCRIBE_SETTLE_MS); - notifier.publish({ runId: "run_x" }); + notifier.publish({ runId: "run_1", envId: "env_1", tags: ["a"], batchId: "batch_1" }); - await vi.waitFor(() => expect(firstResolved && secondResolved).toBe(true), { + await vi.waitFor(() => expect(received.some((r) => r.runId === "run_1")).toBe(true), { timeout: 5_000, interval: 50, }); + const got = received.find((r) => r.runId === "run_1")!; + expect(got.tags).toEqual(["a"]); + expect(got.batchId).toBe("batch_1"); - // Channel stays until the last waiter unsubscribes. 
Dropping one waiter only - // shrinks the listener set (no UNSUBSCRIBE), so the count is still 1 synchronously. - first.unsubscribe(); - expect(notifier.activeSubscriptionCount).toBe(1); - // The last unsubscribe issues UNSUBSCRIBE; the channel is dropped once Redis confirms. - second.unsubscribe(); + unsubscribe(); + // Cleanup is deferred until Redis confirms UNSUBSCRIBE, so the count converges to 0. await vi.waitFor(() => expect(notifier.activeSubscriptionCount).toBe(0), { timeout: 5_000, interval: 50, @@ -115,65 +56,19 @@ describe("RunChangeNotifier", () => { ); redisTest( - "publish with no subscribers is a harmless no-op", - { timeout: 30_000 }, - async ({ redisOptions }) => { - const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); - try { - expect(() => notifier.publish({ runId: "nobody_listening" })).not.toThrow(); - } finally { - await notifier.quit(); - } - } - ); - - redisTest( - "wakes an env subscriber when a run in that env changes (tag-list feed)", + "does not deliver a change for a different env", { timeout: 30_000 }, async ({ redisOptions }) => { const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); try { - const envSub = notifier.subscribeToEnvChanges("env_1"); - let envWoke = false; - void envSub.changed.then(() => { - envWoke = true; - }); + const received: ChangeRecord[] = []; + notifier.subscribeToEnv("env_a", (records) => received.push(...records)); await sleep(SUBSCRIBE_SETTLE_MS); - // A run change WITH an environmentId fans out to the per-env channel. 
- notifier.publish({ runId: "run_1", environmentId: "env_1" }); - - await vi.waitFor(() => expect(envWoke).toBe(true), { timeout: 5_000, interval: 50 }); - envSub.unsubscribe(); - await vi.waitFor(() => expect(notifier.activeSubscriptionCount).toBe(0), { - timeout: 5_000, - interval: 50, - }); - } finally { - await notifier.quit(); - } - } - ); - - redisTest( - "does not wake an env subscriber for a different env, nor when env is omitted", - { timeout: 30_000 }, - async ({ redisOptions }) => { - const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); - try { - const envSub = notifier.subscribeToEnvChanges("env_a"); - let envWoke = false; - void envSub.changed.then(() => { - envWoke = true; - }); - - await sleep(SUBSCRIBE_SETTLE_MS); - notifier.publish({ runId: "run_1", environmentId: "env_b" }); // different env - notifier.publish({ runId: "run_2" }); // no env -> per-run channel only + notifier.publish({ runId: "run_1", envId: "env_b", tags: [] }); // different env await sleep(500); - expect(envWoke).toBe(false); - envSub.unsubscribe(); + expect(received).toHaveLength(0); } finally { await notifier.quit(); } @@ -181,77 +76,36 @@ describe("RunChangeNotifier", () => { ); redisTest( - "re-subscribing right after the last unsubscribe still delivers", + "coalesces a burst of env publishes into far fewer batches than publishes (lossless)", { timeout: 30_000 }, async ({ redisOptions }) => { - const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions) }); - try { - const first = notifier.subscribeToRunChanges("run_race"); - await sleep(SUBSCRIBE_SETTLE_MS); - - // Drop the last waiter (issues UNSUBSCRIBE) and immediately re-subscribe before - // it can settle. The channel must end up subscribed so the new waiter wakes. 
- first.unsubscribe(); - const second = notifier.subscribeToRunChanges("run_race"); - let woke = false; - void second.changed.then(() => { - woke = true; - }); - - await sleep(SUBSCRIBE_SETTLE_MS); - notifier.publish({ runId: "run_race" }); - - await vi.waitFor(() => expect(woke).toBe(true), { timeout: 5_000, interval: 50 }); - second.unsubscribe(); - } finally { - await notifier.quit(); - } - } - ); - - redisTest( - "coalesces a burst of env publishes into far fewer wakes than publishes", - { timeout: 30_000 }, - async ({ redisOptions }) => { - // A busy env's run-change firehose must not wake feeds once per publication. const notifier = new RunChangeNotifier({ redis: toRedisOptions(redisOptions), envWakeCoalesceWindowMs: 100, }); try { - // Count wakes by continuously re-subscribing (each subscription is one-shot). - let wakes = 0; - let running = true; - const counter = (async () => { - while (running) { - const sub = notifier.subscribeToEnvChanges("env_burst"); - let woke = false; - void sub.changed.then(() => (woke = true)).catch(() => {}); - const start = Date.now(); - while (!woke && running && Date.now() - start < 1_500) { - await sleep(5); - } - sub.unsubscribe(); - if (woke) wakes++; - else break; - } - })(); + let batches = 0; + const runIds = new Set(); + notifier.subscribeToEnv("env_burst", (records) => { + batches++; + for (const r of records) runIds.add(r.runId); + }); await sleep(SUBSCRIBE_SETTLE_MS); - // Publish ~200/s for a second to the same env channel. let pubs = 0; const end = Date.now() + 1_000; while (Date.now() < end) { - notifier.publish({ runId: `r${pubs++}`, environmentId: "env_burst" }); + notifier.publish({ runId: `r${pubs++}`, envId: "env_burst", tags: [] }); await sleep(5); } - running = false; - await counter; + await sleep(300); expect(pubs).toBeGreaterThan(100); - expect(wakes).toBeGreaterThanOrEqual(1); // leading edge still delivers - // Leading-edge throttle caps wakes to ~time/window, well below the publish count. 
- expect(wakes).toBeLessThan(pubs / 4); + expect(batches).toBeGreaterThanOrEqual(1); + // Leading-edge throttle: far fewer deliveries than publishes... + expect(batches).toBeLessThan(pubs / 4); + // ...but lossless — the batch accumulates every run that changed in the window. + expect(runIds.size).toBeGreaterThan(pubs / 2); } finally { await notifier.quit(); } @@ -259,11 +113,10 @@ describe("RunChangeNotifier", () => { ); // Sharded pub/sub (SSUBSCRIBE/SPUBLISH/smessage) wiring — validated end to end on a - // single node (Redis 7.2 accepts these commands and delivers same-node). Multi-shard - // ROUTING needs a real cluster (covered by the cluster fixture), but this proves the - // notifier's sharded command + event path is correct. + // single node (Redis 7.2 accepts these and delivers same-node). Multi-shard ROUTING + // needs a real cluster (the cluster fixture covers that); this proves the command path. redisTest( - "delivers via sharded pub/sub on the per-run channel", + "delivers via sharded pub/sub on the env channel", { timeout: 30_000 }, async ({ redisOptions }) => { const notifier = new RunChangeNotifier({ @@ -271,46 +124,49 @@ describe("RunChangeNotifier", () => { shardedPubSub: true, }); try { - const subscription = notifier.subscribeToRunChanges("run_sharded"); - let resolved = false; - void subscription.changed.then(() => { - resolved = true; - }); + const received: ChangeRecord[] = []; + notifier.subscribeToEnv("env_sharded", (records) => received.push(...records)); await sleep(SUBSCRIBE_SETTLE_MS); - notifier.publish({ runId: "run_sharded" }); + notifier.publish({ runId: "run_1", envId: "env_sharded", tags: ["a"] }); - await vi.waitFor(() => expect(resolved).toBe(true), { timeout: 5_000, interval: 50 }); - subscription.unsubscribe(); + await vi.waitFor(() => expect(received.some((r) => r.runId === "run_1")).toBe(true), { + timeout: 5_000, + interval: 50, + }); } finally { await notifier.quit(); } } ); - redisTest( - "delivers via sharded pub/sub on 
the per-env channel", - { timeout: 30_000 }, - async ({ redisOptions }) => { - const notifier = new RunChangeNotifier({ - redis: toRedisOptions(redisOptions), - shardedPubSub: true, + describe("ChangeRecord codec", () => { + it("round-trips a full record (tags with a separator survive)", () => { + const encoded = encodeChangeRecord({ + v: 1, + runId: "run_1", + envId: "env_1", + tags: ["a", "b,c"], + batchId: "batch_1", }); - try { - const envSub = notifier.subscribeToEnvChanges("env_sharded"); - let envWoke = false; - void envSub.changed.then(() => { - envWoke = true; - }); - - await sleep(SUBSCRIBE_SETTLE_MS); - notifier.publish({ runId: "run_1", environmentId: "env_sharded" }); - - await vi.waitFor(() => expect(envWoke).toBe(true), { timeout: 5_000, interval: 50 }); - envSub.unsubscribe(); - } finally { - await notifier.quit(); - } - } - ); + expect(decodeChangeRecord(encoded)).toMatchObject({ + v: 1, + runId: "run_1", + envId: "env_1", + tags: ["a", "b,c"], + batchId: "batch_1", + }); + }); + + it("decodes a bare runId to a partial record (tags undefined)", () => { + // A bare/legacy frame: the consumer falls back to hydrate-to-classify. 
+ const decoded = decodeChangeRecord("run_3"); + expect(decoded.runId).toBe("run_3"); + expect(decoded.tags).toBeUndefined(); + }); + + it("falls back to a bare runId on an unparseable message", () => { + expect(decodeChangeRecord("{not json").runId).toBe("{not json"); + }); + }); }); diff --git a/internal-packages/run-engine/src/engine/eventBus.ts b/internal-packages/run-engine/src/engine/eventBus.ts index 2e4adeed4b1..bd29869d280 100644 --- a/internal-packages/run-engine/src/engine/eventBus.ts +++ b/internal-packages/run-engine/src/engine/eventBus.ts @@ -11,7 +11,14 @@ export type EventBusEvents = { runCreated: [ { time: Date; - runId: string; + run: { + id: string; + runTags: string[]; + batchId: string | null; + }; + environment: { + id: string; + }; }, ]; runEnqueuedAfterDelay: [ @@ -23,6 +30,8 @@ export type EventBusEvents = { queuedAt: Date; updatedAt: Date; createdAt: Date; + runTags: string[]; + batchId: string | null; }; organization: { id: string; @@ -44,6 +53,8 @@ export type EventBusEvents = { delayUntil: Date; updatedAt: Date; createdAt: Date; + runTags: string[]; + batchId: string | null; }; organization: { id: string; @@ -76,6 +87,8 @@ export type EventBusEvents = { maxDurationInSeconds?: number; maxAttempts?: number; createdAt: Date; + runTags: string[]; + batchId: string | null; }; organization: { id: string; @@ -96,6 +109,8 @@ export type EventBusEvents = { status: TaskRunStatus; updatedAt: Date; createdAt: Date; + runTags: string[]; + batchId: string | null; }; organization: { id?: string; @@ -119,6 +134,8 @@ export type EventBusEvents = { attemptNumber: number; baseCostInCents: number; executedAt: Date | undefined; + runTags: string[]; + batchId: string | null; }; organization: { id: string; @@ -245,6 +262,8 @@ export type EventBusEvents = { createdAt: Date; error: TaskRunError; taskEventStore?: string; + runTags: string[]; + batchId: string | null; }; organization: { id: string; diff --git a/internal-packages/run-engine/src/engine/index.ts 
b/internal-packages/run-engine/src/engine/index.ts index 835ff90cc48..c3e0a5c75d0 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -1042,7 +1042,14 @@ export class RunEngine { this.eventBus.emit("runCreated", { time: new Date(), - runId: taskRun.id, + run: { + id: taskRun.id, + runTags: taskRun.runTags, + batchId: taskRun.batchId, + }, + environment: { + id: environment.id, + }, }); return taskRun; diff --git a/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts b/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts index 384384fd8c7..6c66591e288 100644 --- a/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts @@ -147,6 +147,8 @@ export class CheckpointSystem { status: run.status, updatedAt: run.updatedAt, createdAt: run.createdAt, + runTags: run.runTags, + batchId: run.batchId, }, organization: { id: run.runtimeEnvironment.organizationId, @@ -308,6 +310,8 @@ export class CheckpointSystem { projectId: true, updatedAt: true, createdAt: true, + runTags: true, + batchId: true, }, }); @@ -326,6 +330,8 @@ export class CheckpointSystem { status: run.status, updatedAt: run.updatedAt, createdAt: run.createdAt, + runTags: run.runTags, + batchId: run.batchId, }, organization: { id: run.organizationId ?? 
undefined, diff --git a/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts b/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts index 32ab98bad6c..10c965741cf 100644 --- a/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts @@ -79,6 +79,8 @@ export class DelayedRunSystem { delayUntil: delayUntil, updatedAt: updatedRun.updatedAt, createdAt: updatedRun.createdAt, + runTags: updatedRun.runTags, + batchId: updatedRun.batchId, }, organization: { id: snapshot.organizationId, @@ -192,6 +194,8 @@ export class DelayedRunSystem { queuedAt, updatedAt: updatedRun.updatedAt, createdAt: updatedRun.createdAt, + runTags: updatedRun.runTags, + batchId: updatedRun.batchId, }, organization: { id: run.runtimeEnvironment.organizationId, diff --git a/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts b/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts index 3fe1ef072cf..7c811ebfdfc 100644 --- a/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts @@ -490,6 +490,8 @@ export class DequeueSystem { maxAttempts: lockedTaskRun.maxAttempts ?? 
undefined, updatedAt: lockedTaskRun.updatedAt, createdAt: lockedTaskRun.createdAt, + runTags: lockedTaskRun.runTags, + batchId: lockedTaskRun.batchId, }, organization: { id: orgId, @@ -751,6 +753,8 @@ export class DequeueSystem { attemptNumber: true, updatedAt: true, createdAt: true, + runTags: true, + batchId: true, runtimeEnvironment: { select: { id: true, @@ -792,6 +796,8 @@ export class DequeueSystem { status: run.status, updatedAt: run.updatedAt, createdAt: run.createdAt, + runTags: run.runTags, + batchId: run.batchId, }, organization: { id: run.runtimeEnvironment.project.organizationId, diff --git a/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts b/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts index 6d503012fbc..b46b857f02a 100644 --- a/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/pendingVersionSystem.ts @@ -163,6 +163,8 @@ export class PendingVersionSystem { status: "PENDING", updatedAt: run.updatedAt, createdAt: run.createdAt, + runTags: run.runTags, + batchId: run.batchId, }, organization: { id: backgroundWorker.runtimeEnvironment.organizationId, diff --git a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts index 06c80f67f2c..02fd83a7a25 100644 --- a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts @@ -520,6 +520,8 @@ export class RunAttemptSystem { attemptNumber: nextAttemptNumber, baseCostInCents: updatedRun.baseCostInCents, executedAt: updatedRun.executedAt ?? 
undefined, + runTags: updatedRun.runTags, + batchId: updatedRun.batchId, }, organization: { id: updatedRun.runtimeEnvironment.organizationId, @@ -1052,6 +1054,8 @@ export class RunAttemptSystem { error: completion.error, createdAt: run.createdAt, taskEventStore: run.taskEventStore, + runTags: run.runTags, + batchId: run.batchId, }, organization: { id: run.runtimeEnvironment.organizationId, From 3348cfe20b96f14d6477f8be53a7b024dfd67cf8 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 9 Jun 2026 19:08:46 +0100 Subject: [PATCH 11/23] fix(webapp): wake live feeds for mid-run metadata updates by their internal run id The metadata route published the friendly run id, but the realtime router keys single-run feeds by the internal id, so a mid-run metadata.set() never reached a live feed through the fast path (only the backstop caught it). Surface the internal id from the metadata service, which already reads it, and publish that. Also widen a realtime hold-on-empty test backstop so it stops racing the timeout under slow CI. --- apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts | 10 ++++------ .../app/services/metadata/updateMetadata.server.ts | 2 ++ apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts index 65cbd29c627..ab4747da46c 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts @@ -185,12 +185,10 @@ const { action } = createActionApiRoute( return json({ error: "Internal Server Error" }, { status: 500 }); } if (pgResult) { - // Mid-run metadata flush succeeded: publish a run-changed record so a live single-run - // feed reflects metadata.set() without waiting for the next lifecycle event (this - // path doesn't otherwise touch the engine event bus). envId is free; partial record, - // matched by runId. 
No-op when disabled. - publishChangeRecord({ runId, envId: env.id }); - return json(pgResult, { status: 200 }); + // Reflect metadata.set() on a live feed before the next lifecycle event. Publish the + // internal id (the router keys single-run feeds by it, not the friendly id from the URL). + publishChangeRecord({ runId: pgResult.runId, envId: env.id }); + return json({ metadata: pgResult.metadata }, { status: 200 }); } // PG miss. Target run is either buffered or genuinely absent. diff --git a/apps/webapp/app/services/metadata/updateMetadata.server.ts b/apps/webapp/app/services/metadata/updateMetadata.server.ts index cfb946a1024..6ea7e3a3a8d 100644 --- a/apps/webapp/app/services/metadata/updateMetadata.server.ts +++ b/apps/webapp/app/services/metadata/updateMetadata.server.ts @@ -355,6 +355,8 @@ export class UpdateMetadataService { return { metadata: newMetadata, + // Internal id, so callers can publish realtime records keyed how the router indexes feeds. + runId: taskRun.id, }; } diff --git a/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts b/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts index c9976e96678..e0c51d57f52 100644 --- a/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts +++ b/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts @@ -149,7 +149,8 @@ describe("NotifierRealtimeClient multi-run live path over the router", () => { }); it("a matching run created before the window floor is hydrated but dropped (keeps holding)", async () => { - const { client, src, hydrateSpy, resolveSpy, setRows } = makeClient({ livePollTimeoutMs: 120 }); + // Generous backstop so the "still holding" assertion can't race a timeout in slow CI. 
+ const { client, src, hydrateSpy, resolveSpy, setRows } = makeClient({ livePollTimeoutMs: 1500 }); setRows([row("run_1", FLOOR_MS + 5_000, { createdAtMs: FLOOR_MS - 10_000, tags: ["t"] })]); const responsePromise = liveRuns(client); From 3a45f3da9d2ea0c666dda91c2f5f28160bc24256 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Tue, 9 Jun 2026 23:25:37 +0100 Subject: [PATCH 12/23] fix(webapp): stop concurrent batch subscribers sharing a realtime working set Batch feeds derived their stream handle from the batchId, so two subscribers to the same batch on one instance shared a working-set cache entry and one could permanently suppress the other's deltas, including terminal status updates. Handles are now minted per connection with a collision-proof suffix and working-set entries are keyed by environment, so a client-echoed handle can never read or overwrite another connection's state. Also: mid-run metadata updates carry the batchId so batch feeds fast-wake instead of waiting for the backstop, and the run-engine metadata handler publishes after the write lands so hydration sees the new row. --- .../app/routes/api.v1.runs.$runId.metadata.ts | 2 +- .../metadata/updateMetadata.server.ts | 4 +- .../realtime/notifierRealtimeClient.server.ts | 41 +++++++++++++------ .../webapp/app/v3/runEngineHandlers.server.ts | 7 ++-- .../test/realtime/notifierRunSetCache.test.ts | 25 ++++++++++- 5 files changed, 59 insertions(+), 20 deletions(-) diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts index ab4747da46c..c88009a84a4 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts @@ -187,7 +187,7 @@ const { action } = createActionApiRoute( if (pgResult) { // Reflect metadata.set() on a live feed before the next lifecycle event. Publish the // internal id (the router keys single-run feeds by it, not the friendly id from the URL). 
- publishChangeRecord({ runId: pgResult.runId, envId: env.id }); + publishChangeRecord({ runId: pgResult.runId, envId: env.id, batchId: pgResult.batchId }); return json({ metadata: pgResult.metadata }, { status: 200 }); } diff --git a/apps/webapp/app/services/metadata/updateMetadata.server.ts b/apps/webapp/app/services/metadata/updateMetadata.server.ts index 6ea7e3a3a8d..6422e3c5666 100644 --- a/apps/webapp/app/services/metadata/updateMetadata.server.ts +++ b/apps/webapp/app/services/metadata/updateMetadata.server.ts @@ -308,6 +308,7 @@ export class UpdateMetadataService { }, select: { id: true, + batchId: true, completedAt: true, status: true, metadata: true, @@ -355,8 +356,9 @@ export class UpdateMetadataService { return { metadata: newMetadata, - // Internal id, so callers can publish realtime records keyed how the router indexes feeds. + // Internal id + batchId, so callers can publish realtime records keyed how the router indexes feeds. runId: taskRun.id, + batchId: taskRun.batchId, }; } diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts index 7a62fd429c9..8d5d597c65b 100644 --- a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts +++ b/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts @@ -364,15 +364,11 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { clientVersion?: string, signal?: AbortSignal ): Promise { - const { offset, isLive, skipColumns } = this.#parseStreamRequest(url, requestOptions); + const { offset, handle, isLive, skipColumns } = this.#parseStreamRequest(url, requestOptions); - // The batch set is fully defined by batchId (the route resolves it from the - // friendlyId on every request), so the handle is derived and stable and there's - // no createdAt window to pin. 
- const handle = `batch-${batchId}`; const filter: RunSetFilter = { batchId }; - if (offset !== INITIAL_OFFSET && isLive) { + if (offset !== INITIAL_OFFSET && handle && isLive) { return this.#runSetLiveResponse( environment, filter, @@ -385,11 +381,13 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { ); } - // Initial snapshot + non-live catch-up. + // Initial snapshot + non-live catch-up. The handle must be per-connection, never + // derived from the batchId: working sets are keyed by handle, and a shared handle + // lets one subscriber's emit permanently suppress the same row for another. return this.#runSetSnapshotResponse( environment, filter, - handle, + handle ?? this.#mintBatchHandle(batchId), skipColumns, apiVersion, clientVersion @@ -519,7 +517,7 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { seen.set(row.id, updatedAtMs); maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs); } - this.#workingSetCache.set(handle, seen); + this.#workingSetCache.set(this.#workingSetKey(environment.id, handle), seen); return this.#buildResponse(buildRowsBody(changes, skipColumns), apiVersion, clientVersion, { offset: encodeOffset(maxUpdatedAt, this.#nextSeq()), @@ -556,7 +554,8 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { // Working set we diff against: seeded from the cache (or the offset floor on a // miss) and advanced on each refetch within this held request. - let prevSeen = this.#workingSetCache.get(handle); + const workingSetKey = this.#workingSetKey(environment.id, handle); + let prevSeen = this.#workingSetCache.get(workingSetKey); const emitFromSerialized = (changes: SerializedRowChange[], maxUpdatedAt: number): Response => { const seq = this.#nextSeq(); @@ -614,7 +613,7 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { // Merge (not replace): the router only surfaced the changed subset, so keep the // rest of the working set intact. 
The backstop full-resolve rebuilds it. const merged = this.#mergeWorkingSet(prevSeen, touched); - this.#workingSetCache.set(handle, merged); + this.#workingSetCache.set(workingSetKey, merged); prevSeen = merged; if (changes.length > 0) { @@ -636,7 +635,7 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { prevSeen, offsetFloorMs ); - this.#workingSetCache.set(handle, touched); + this.#workingSetCache.set(workingSetKey, touched); prevSeen = touched; if (changes.length > 0) { @@ -881,7 +880,23 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { #mintListHandle(createdAtFilterMs: number): string { // Pins the createdAt threshold in the opaque handle so live polls reuse the // same lower bound even on a working-set cache miss. - return `runs_${Math.trunc(createdAtFilterMs)}_${this.#nextSeq()}`; + return `runs_${Math.trunc(createdAtFilterMs)}_${this.#mintUniqueSuffix()}`; + } + + #mintBatchHandle(batchId: string): string { + return `batch_${batchId}_${this.#mintUniqueSuffix()}`; + } + + #mintUniqueSuffix(): string { + // The seq alone isn't unique across instances/restarts; behind a non-sticky ALB a + // collision would land two connections on one working-set cache entry. + return `${this.#nextSeq()}_${randomUUID().slice(0, 8)}`; + } + + #workingSetKey(environmentId: string, handle: string): string { + // The handle is client-echoed; env-prefix the key so a foreign handle can never + // read or overwrite another tenant's working set. 
+ return `${environmentId}:${handle}`; } #filterMsFromHandle(handle: string): number | undefined { diff --git a/apps/webapp/app/v3/runEngineHandlers.server.ts b/apps/webapp/app/v3/runEngineHandlers.server.ts index edd25ca8cde..7ef4efdef82 100644 --- a/apps/webapp/app/v3/runEngineHandlers.server.ts +++ b/apps/webapp/app/v3/runEngineHandlers.server.ts @@ -571,12 +571,11 @@ export function registerRunEngineEventBusHandlers() { const { environment, runTags, batchId } = result; - // Realtime run-changed publish: a full record (env + tags + batchId all from the one - // read above), so tag/batch feeds route by index instead of hydrate-to-classify. - publishChangeRecord({ runId: run.id, envId: environment.id, tags: runTags, batchId }); - try { await updateMetadataService.call(run.id, run.metadata, environment); + // Realtime run-changed publish, after the write so the router's hydrate sees the new + // row. A full record (env + tags + batchId), so feeds route by index. + publishChangeRecord({ runId: run.id, envId: environment.id, tags: runTags, batchId }); } catch (e) { if (e instanceof MetadataTooLargeError) { logger.warn("[runMetadataUpdated] Failed to update metadata, too large", { diff --git a/apps/webapp/test/realtime/notifierRunSetCache.test.ts b/apps/webapp/test/realtime/notifierRunSetCache.test.ts index 6d4fd61e1ab..7a6449a9eb7 100644 --- a/apps/webapp/test/realtime/notifierRunSetCache.test.ts +++ b/apps/webapp/test/realtime/notifierRunSetCache.test.ts @@ -127,6 +127,29 @@ describe("NotifierRealtimeClient run-set resolve coalescing + cache", () => { await snapshot(client, "batch_1"); expect(results).toEqual(["miss", "hit"]); }); + + it("mints a distinct batch handle per connection and echoes a client-provided one", async () => { + const { client } = makeClient(); + // Two subscribers to the SAME batch must never share a handle (the working-set + // cache is keyed by it; sharing lets one suppress the other's deltas forever). 
+ const res1 = await snapshot(client, "batch_1"); + const res2 = await snapshot(client, "batch_1"); + const h1 = res1.headers.get("electric-handle"); + const h2 = res2.headers.get("electric-handle"); + expect(h1).toBeTruthy(); + expect(h1).not.toBe(h2); + + // Catch-up under an existing handle keeps it. + const res3 = await client.streamBatch( + `http://localhost:3030/realtime/v1/batches/batch_1?offset=123_1&handle=${h1}`, + ENV, + "batch_1", + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); + expect(res3.headers.get("electric-handle")).toBe(h1); + }); }); describe("NotifierRealtimeClient resolve admission gate (mass-reconnect stampede)", () => { @@ -304,7 +327,7 @@ describe("NotifierRealtimeClient review fixes", () => { livePollTimeoutMs: 50, }); const res = await client.streamBatch( - "http://localhost:3030/realtime/v1/batches/batch_1?offset=123_1&live=true", + "http://localhost:3030/realtime/v1/batches/batch_1?offset=123_1&live=true&handle=batch_batch_1_7_abc", ENV, "batch_1", CURRENT_API_VERSION, From 9f4c374469c4ef572caf6cded02e179f4ccbf908 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 10 Jun 2026 09:04:49 +0100 Subject: [PATCH 13/23] fix(webapp): catch change-routing failures instead of crashing the process The realtime change router invoked its async routing tick fire-and-forget without a catch, so a transient database error during hydration became an unhandled promise rejection, which exits the process. Failures are now caught and logged; affected feeds fall back to their full-resolve backstop. 
--- .../realtime/envChangeRouter.server.ts | 13 +++++++++--- .../test/realtime/envChangeRouter.test.ts | 21 +++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/apps/webapp/app/services/realtime/envChangeRouter.server.ts b/apps/webapp/app/services/realtime/envChangeRouter.server.ts index 0c68140e58b..bba7a92452d 100644 --- a/apps/webapp/app/services/realtime/envChangeRouter.server.ts +++ b/apps/webapp/app/services/realtime/envChangeRouter.server.ts @@ -1,5 +1,6 @@ import { type ChangeRecord } from "./runChangeNotifier.server"; import { type RealtimeRunRow, serializeRunRow } from "./electricStreamProtocol.server"; +import { logger } from "~/services/logger.server"; /** * EnvChangeRouter — the per-instance routing layer that turns "feeds as predicates over @@ -182,9 +183,15 @@ export class EnvChangeRouter { }; this.#envs.set(environmentId, env); env.unsubscribe = this.options.source.subscribeToEnv(environmentId, (records) => { - // Fire-and-forget; the notifier doesn't await us. Errors fall through to the feeds' - // backstop (a hydrate failure leaves waiters to time out into a full resolve). - void this.#onBatch(environmentId, env, records); + // Fire-and-forget; the notifier doesn't await us. A hydrate failure must be caught + // here (an unhandled rejection exits the process); the matched feeds' waiters stay + // armed and time out into the full-resolve backstop. 
+ this.#onBatch(environmentId, env, records).catch((error) => { + logger.error("[envChangeRouter] failed to route a change batch", { + environmentId, + error, + }); + }); }); return env; } diff --git a/apps/webapp/test/realtime/envChangeRouter.test.ts b/apps/webapp/test/realtime/envChangeRouter.test.ts index befe0356284..4779a702bc5 100644 --- a/apps/webapp/test/realtime/envChangeRouter.test.ts +++ b/apps/webapp/test/realtime/envChangeRouter.test.ts @@ -98,6 +98,27 @@ describe("EnvChangeRouter", () => { regs.forEach((r) => r.close()); }); + it("a hydrate failure doesn't reject out of the source callback; the feed times out", async () => { + const src = fakeSource(); + const hydrateSpy = vi.fn(async () => { + throw new Error("replica down"); + }); + const router = new EnvChangeRouter({ + source: src.source, + hydrator: { hydrateByIds: hydrateSpy }, + }); + const reg = router.register("env_1", { kind: "run", runId: "r1" }, []); + const wait = reg.waitForMatch(undefined, 50); + + // Would be an unhandled rejection (process exit) if #onBatch's promise were unguarded. + src.push("env_1", [record("r1")]); + + const result = await wait; + expect(result.reason).toBe("timeout"); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + reg.close(); + }); + it("routes a run feed by exact runId", async () => { const rows = new Map([["r1", row("r1")]]); const { router, src } = makeRouter(rows); From 7c72cdf5b353351ddfb5fbe08377d87f330bc19a Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 10 Jun 2026 09:51:15 +0100 Subject: [PATCH 14/23] fix(webapp): publish full realtime change records from the metadata and tags routes A change record without tags makes the router wake every tag feed in the environment to hydrate-and-classify, so high-frequency metadata updates caused unnecessary database load; a record without a batchId skips batch feeds entirely. Both routes now publish the full membership keys, piggybacked on reads they already do. 
--- apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts | 7 ++++++- apps/webapp/app/routes/api.v1.runs.$runId.tags.ts | 1 + apps/webapp/app/services/metadata/updateMetadata.server.ts | 4 +++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts index c88009a84a4..d3d92d1fe5d 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.metadata.ts @@ -187,7 +187,12 @@ const { action } = createActionApiRoute( if (pgResult) { // Reflect metadata.set() on a live feed before the next lifecycle event. Publish the // internal id (the router keys single-run feeds by it, not the friendly id from the URL). - publishChangeRecord({ runId: pgResult.runId, envId: env.id, batchId: pgResult.batchId }); + publishChangeRecord({ + runId: pgResult.runId, + envId: env.id, + tags: pgResult.runTags, + batchId: pgResult.batchId, + }); return json({ metadata: pgResult.metadata }, { status: 200 }); } diff --git a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts index c8fa5ea37d2..e98d3f35823 100644 --- a/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts +++ b/apps/webapp/app/routes/api.v1.runs.$runId.tags.ts @@ -97,6 +97,7 @@ export async function action({ request, params }: ActionFunctionArgs) { runId: taskRun.id, envId: env.id, tags: existing.concat(newTags), + batchId: taskRun.batchId, }); return json({ message: `Successfully set ${newTags.length} new tags.` }, { status: 200 }); }, diff --git a/apps/webapp/app/services/metadata/updateMetadata.server.ts b/apps/webapp/app/services/metadata/updateMetadata.server.ts index 6422e3c5666..3948da046f9 100644 --- a/apps/webapp/app/services/metadata/updateMetadata.server.ts +++ b/apps/webapp/app/services/metadata/updateMetadata.server.ts @@ -309,6 +309,7 @@ export class UpdateMetadataService { select: { id: true, batchId: 
true, + runTags: true, completedAt: true, status: true, metadata: true, @@ -356,9 +357,10 @@ export class UpdateMetadataService { return { metadata: newMetadata, - // Internal id + batchId, so callers can publish realtime records keyed how the router indexes feeds. + // Internal id + membership keys, so callers can publish full realtime records the router routes by index. runId: taskRun.id, batchId: taskRun.batchId, + runTags: taskRun.runTags, }; } From cab6b052d0743bd4f05cbc40ad1ae9c7cdefc040 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 10 Jun 2026 10:37:46 +0100 Subject: [PATCH 15/23] refactor(webapp): rename the realtime backend to native and tune it via env vars The new realtime runs backend is now named native instead of notifier: the feature flag selects electric | native | shadow, and all of its env vars share the REALTIME_BACKEND_NATIVE_ prefix, including the dedicated pub/sub Redis and ClickHouse pool configs. Remaining hardcoded tunables (live-poll jitter, working-set TTL, single-run cache, backend flag cache, default concurrency limit) moved to env vars with unchanged defaults. Backend selection reuses the org feature flags already loaded by request auth instead of an extra organization read, and its caches survive dev-mode reloads. Oversized comment blocks trimmed throughout. 
--- apps/webapp/app/entry.server.tsx | 4 +- apps/webapp/app/env.server.ts | 109 +++++------- .../routes/realtime.v1.batches.$batchId.ts | 3 +- .../app/routes/realtime.v1.runs.$runId.ts | 9 +- apps/webapp/app/routes/realtime.v1.runs.ts | 10 +- .../clickhouse/clickhouseFactory.server.ts | 26 +-- .../app/services/realtime/boundedTtlCache.ts | 13 +- .../clickHouseRunListResolver.server.ts | 12 +- .../realtime/electricStreamProtocol.server.ts | 62 ++----- .../realtime/envChangeRouter.server.ts | 30 +--- ...rver.ts => nativeRealtimeClient.server.ts} | 161 +++++------------- ...=> nativeRealtimeClientInstance.server.ts} | 65 +++---- .../realtimeConcurrencyLimiter.server.ts | 16 +- .../resolveRealtimeStreamClient.server.ts | 77 +++++---- .../realtime/runChangeNotifier.server.ts | 94 ++-------- .../runChangeNotifierHandlers.server.ts | 28 +-- .../runChangeNotifierInstance.server.ts | 32 ++-- .../app/services/realtime/runReader.server.ts | 44 +---- .../services/realtime/shadowCompare.server.ts | 66 +++---- .../realtime/shadowRealtimeClient.server.ts | 14 +- .../shadowRealtimeClientInstance.server.ts | 4 +- apps/webapp/app/v3/featureFlags.ts | 4 +- .../webapp/app/v3/runEngineHandlers.server.ts | 10 +- ...mpty.test.ts => nativeHoldOnEmpty.test.ts} | 10 +- ...t.test.ts => nativeRealtimeClient.test.ts} | 8 +- ...ache.test.ts => nativeRunSetCache.test.ts} | 20 +-- .../test/realtime/shadowCompare.test.ts | 18 +- 27 files changed, 325 insertions(+), 624 deletions(-) rename apps/webapp/app/services/realtime/{notifierRealtimeClient.server.ts => nativeRealtimeClient.server.ts} (79%) rename apps/webapp/app/services/realtime/{notifierRealtimeClientInstance.server.ts => nativeRealtimeClientInstance.server.ts} (66%) rename apps/webapp/test/realtime/{notifierHoldOnEmpty.test.ts => nativeHoldOnEmpty.test.ts} (96%) rename apps/webapp/test/realtime/{notifierRealtimeClient.test.ts => nativeRealtimeClient.test.ts} (95%) rename apps/webapp/test/realtime/{notifierRunSetCache.test.ts => 
nativeRunSetCache.test.ts} (95%) diff --git a/apps/webapp/app/entry.server.tsx b/apps/webapp/app/entry.server.tsx index 8cc23bff089..1282127cb20 100644 --- a/apps/webapp/app/entry.server.tsx +++ b/apps/webapp/app/entry.server.tsx @@ -270,8 +270,8 @@ process.on("uncaughtException", (error, origin) => { singleton("RunEngineEventBusHandlers", registerRunEngineEventBusHandlers); singleton("SetupBatchQueueCallbacks", setupBatchQueueCallbacks); -// Attach the run-changed notifier delegations to the engine event bus. -// No-ops (registers nothing) unless REALTIME_NOTIFIER_ENABLED=1. +// Attach the realtime run-changed publish delegations to the engine event bus. +// No-ops (registers nothing) unless REALTIME_BACKEND_NATIVE_ENABLED=1. singleton("RunChangeNotifierHandlers", registerRunChangeNotifierHandlers); // Wrapped in singleton() so Remix's dev-mode CJS reloads don't append diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index f01e8285916..c6844e50504 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -300,46 +300,36 @@ const EnvironmentSchema = z .int() .default(24 * 60 * 60 * 1000), // 1 day in milliseconds - // Master switch for the notifier-backed realtime feed. - // "0" (default) = the existing realtime path serves everything, publishes are - // no-ops, and no notifier Redis connections are opened (zero-overhead off). - // "1" = run-changed signals are published and the per-org `realtimeBackend` - // feature flag selects the backend per request. - REALTIME_NOTIFIER_ENABLED: z.string().default("0"), - // Backstop wait before a live notifier request refetches the run (ms). Matches - // Electric's ~20s live long-poll hold so the client polling cadence is unchanged - // across backends (a ±15% jitter is applied per request to avoid refetch herds). - REALTIME_NOTIFIER_LIVE_POLL_TIMEOUT_MS: z.coerce.number().int().default(20_000), - // Hard cap on the tag-list snapshot size served by the notifier feed. 
- REALTIME_NOTIFIER_MAX_LIST_RESULTS: z.coerce.number().int().default(1_000), - // Short-TTL coalescing cache for the multi-run (tag-list/batch) resolve+hydrate. - // Concurrent same-filter feeds share one ClickHouse resolve + Postgres hydrate - // within this window, so an env-wide wake doesn't fan out into per-feed queries. - // Staleness budget: a newly-matching run is visible within ~ttl + poll interval. - REALTIME_NOTIFIER_RUNSET_CACHE_TTL_MS: z.coerce.number().int().default(1_000), - REALTIME_NOTIFIER_RUNSET_CACHE_MAX_ENTRIES: z.coerce.number().int().default(5_000), - // Cap on the per-handle working-set cache (runId -> updatedAt) the notifier keeps - // for diffing multi-run live polls. - REALTIME_NOTIFIER_WORKING_SET_MAX_ENTRIES: z.coerce.number().int().default(10_000), - // Quantize the tag-list createdAt lower bound to this epoch-aligned bucket (ms) so - // same-tag feeds that pin their window within the same bucket share one resolve+ - // hydrate cache entry. Floored, so the window only ever widens by < bucket. 0 - // disables bucketing (each feed keeps its exact lower bound). - REALTIME_NOTIFIER_RUNSET_CREATED_AT_BUCKET_MS: z.coerce.number().int().default(60_000), - // Leading-edge throttle (ms) on the per-env wake channel: a busy env's run-change - // firehose is collapsed to at most one feed-wake per window, decoupling wake load - // from run throughput. Lossless because consumers refetch current state on a wake. - // 0 disables coalescing (every change wakes immediately). - REALTIME_NOTIFIER_ENV_WAKE_COALESCE_WINDOW_MS: z.coerce.number().int().default(100), - // When "1", a multi-run live poll woken by a change irrelevant to its filter keeps - // holding the long-poll (re-resolving cheaply) instead of returning an empty - // up-to-date the client would immediately re-issue. "0" reverts to per-wake replies. - REALTIME_NOTIFIER_HOLD_ON_EMPTY: z.string().default("1"), - // Max concurrent fresh ClickHouse resolves (cache misses) per instance. 
Caps the - // distinct-filter reconnect stampede: a mass reconnect of N feeds on N different filters - // queues to this many concurrent CH queries instead of firing all N at once. Same-filter - // bursts collapse via the single-flight cache before taking a permit. 0 disables the gate. - REALTIME_NOTIFIER_RESOLVE_ADMISSION_LIMIT: z.coerce.number().int().default(16), + // Master switch for the native realtime backend; off = Electric serves everything, publishes no-op. + REALTIME_BACKEND_NATIVE_ENABLED: z.string().default("0"), + // Live long-poll backstop hold (ms); matches Electric's ~20s cadence. + REALTIME_BACKEND_NATIVE_LIVE_POLL_TIMEOUT_MS: z.coerce.number().int().default(20_000), + // Jitter ratio on the live-poll hold (0.15 = ±15%) to avoid synchronized refetch herds. + REALTIME_BACKEND_NATIVE_LIVE_POLL_JITTER_RATIO: z.coerce.number().default(0.15), + // Hard cap on the tag-list snapshot size. + REALTIME_BACKEND_NATIVE_MAX_LIST_RESULTS: z.coerce.number().int().default(1_000), + // TTL/size of the coalescing cache for the multi-run resolve+hydrate (same-filter feeds share one query). + REALTIME_BACKEND_NATIVE_RUNSET_CACHE_TTL_MS: z.coerce.number().int().default(1_000), + REALTIME_BACKEND_NATIVE_RUNSET_CACHE_MAX_ENTRIES: z.coerce.number().int().default(5_000), + // Size/TTL of the per-handle working-set cache used to diff multi-run live polls. + REALTIME_BACKEND_NATIVE_WORKING_SET_MAX_ENTRIES: z.coerce.number().int().default(10_000), + REALTIME_BACKEND_NATIVE_WORKING_SET_TTL_MS: z.coerce.number().int().default(300_000), + // Bucket (ms) the tag-list createdAt floor is quantized to so same-tag feeds share a cache entry; 0 disables. + REALTIME_BACKEND_NATIVE_RUNSET_CREATED_AT_BUCKET_MS: z.coerce.number().int().default(60_000), + // Leading-edge throttle (ms) on per-env wake delivery; 0 wakes on every change. 
+ REALTIME_BACKEND_NATIVE_ENV_WAKE_COALESCE_WINDOW_MS: z.coerce.number().int().default(100), + // "1" holds a multi-run live poll open on a non-matching wake instead of replying up-to-date. + REALTIME_BACKEND_NATIVE_HOLD_ON_EMPTY: z.string().default("1"), + // Max concurrent fresh ClickHouse resolves per instance (reconnect-stampede gate); 0 disables. + REALTIME_BACKEND_NATIVE_RESOLVE_ADMISSION_LIMIT: z.coerce.number().int().default(16), + // Fallback per-env concurrent-connection limit when the org has none configured. + REALTIME_BACKEND_NATIVE_DEFAULT_CONCURRENCY_LIMIT: z.coerce.number().int().default(100_000), + // TTL/size of the single-run read-through cache that collapses duplicate refetch bursts. + REALTIME_BACKEND_NATIVE_RUN_CACHE_TTL_MS: z.coerce.number().int().default(250), + REALTIME_BACKEND_NATIVE_RUN_CACHE_MAX_ENTRIES: z.coerce.number().int().default(5_000), + // TTL/size of the per-org realtimeBackend flag cache used to pick the serving backend. + REALTIME_BACKEND_FLAG_CACHE_TTL_MS: z.coerce.number().int().default(30_000), + REALTIME_BACKEND_FLAG_CACHE_MAX_ENTRIES: z.coerce.number().int().default(50_000), PUBSUB_REDIS_HOST: z .string() @@ -373,15 +363,12 @@ const EnvironmentSchema = z PUBSUB_REDIS_TLS_DISABLED: z.string().default(process.env.REDIS_TLS_DISABLED ?? "false"), PUBSUB_REDIS_CLUSTER_MODE_ENABLED: z.string().default("0"), - // Dedicated pub/sub Redis for the realtime runs feed's run-changed notifier, so - // its publish/subscribe traffic can run on its own instance. Each value falls - // back to the shared PUBSUB_REDIS_* (then REDIS_*) when unset, so the default is - // unchanged until explicitly pointed at a dedicated instance. - REALTIME_RUNS_PUBSUB_REDIS_HOST: z + // Dedicated pub/sub Redis for the native realtime backend; falls back to PUBSUB_REDIS_* then REDIS_*. + REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_HOST: z .string() .optional() .transform((v) => v ?? process.env.PUBSUB_REDIS_HOST ?? 
process.env.REDIS_HOST), - REALTIME_RUNS_PUBSUB_REDIS_PORT: z.coerce + REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_PORT: z.coerce .number() .optional() .transform((v) => { @@ -389,24 +376,22 @@ const EnvironmentSchema = z const raw = process.env.PUBSUB_REDIS_PORT ?? process.env.REDIS_PORT; return raw ? parseInt(raw) : undefined; }), - REALTIME_RUNS_PUBSUB_REDIS_USERNAME: z + REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_USERNAME: z .string() .optional() .transform((v) => v ?? process.env.PUBSUB_REDIS_USERNAME ?? process.env.REDIS_USERNAME), - REALTIME_RUNS_PUBSUB_REDIS_PASSWORD: z + REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_PASSWORD: z .string() .optional() .transform((v) => v ?? process.env.PUBSUB_REDIS_PASSWORD ?? process.env.REDIS_PASSWORD), - REALTIME_RUNS_PUBSUB_REDIS_TLS_DISABLED: z + REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_TLS_DISABLED: z .string() .default(process.env.PUBSUB_REDIS_TLS_DISABLED ?? process.env.REDIS_TLS_DISABLED ?? "false"), - REALTIME_RUNS_PUBSUB_REDIS_CLUSTER_MODE_ENABLED: z + REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_CLUSTER_MODE_ENABLED: z .string() .default(process.env.PUBSUB_REDIS_CLUSTER_MODE_ENABLED ?? "0"), - // Use sharded pub/sub (SSUBSCRIBE/SPUBLISH) when in cluster mode, so a busy env's - // traffic stays on one shard instead of broadcasting to every node. Only takes - // effect alongside CLUSTER_MODE_ENABLED. "0" forces classic pub/sub on the cluster. - REALTIME_RUNS_PUBSUB_REDIS_SHARDED_ENABLED: z.string().default("1"), + // Use sharded pub/sub (SSUBSCRIBE/SPUBLISH) in cluster mode; "0" forces classic pub/sub. 
+ REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_SHARDED_ENABLED: z.string().default("1"), DEFAULT_ENV_EXECUTION_CONCURRENCY_LIMIT: z.coerce.number().int().default(100), DEFAULT_ENV_EXECUTION_CONCURRENCY_BURST_FACTOR: z.coerce.number().default(1.0), @@ -1684,20 +1669,18 @@ const EnvironmentSchema = z .enum(["log", "error", "warn", "info", "debug"]) .default("info"), RUN_ENGINE_CLICKHOUSE_COMPRESSION_REQUEST: z.string().default("1"), - // ClickHouse client used by the realtime runs feed for tag/batch id resolution. - // Kept on its own URL + pool so the feed's reads can't contend with the main - // analytics client (CLICKHOUSE_URL). Falls back to the main URL when unset. - REALTIME_RUNS_CLICKHOUSE_URL: z + // Dedicated ClickHouse pool for the native backend's tag/batch id resolution; falls back to CLICKHOUSE_URL. + REALTIME_BACKEND_NATIVE_CLICKHOUSE_URL: z .string() .optional() .transform((v) => v ?? process.env.CLICKHOUSE_URL), - REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_ENABLED: z.string().default("1"), - REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS: z.coerce.number().int().optional(), - REALTIME_RUNS_CLICKHOUSE_MAX_OPEN_CONNECTIONS: z.coerce.number().int().default(10), - REALTIME_RUNS_CLICKHOUSE_LOG_LEVEL: z + REALTIME_BACKEND_NATIVE_CLICKHOUSE_KEEP_ALIVE_ENABLED: z.string().default("1"), + REALTIME_BACKEND_NATIVE_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS: z.coerce.number().int().optional(), + REALTIME_BACKEND_NATIVE_CLICKHOUSE_MAX_OPEN_CONNECTIONS: z.coerce.number().int().default(10), + REALTIME_BACKEND_NATIVE_CLICKHOUSE_LOG_LEVEL: z .enum(["log", "error", "warn", "info", "debug"]) .default("info"), - REALTIME_RUNS_CLICKHOUSE_COMPRESSION_REQUEST: z.string().default("1"), + REALTIME_BACKEND_NATIVE_CLICKHOUSE_COMPRESSION_REQUEST: z.string().default("1"), EVENTS_CLICKHOUSE_BATCH_SIZE: z.coerce.number().int().default(1000), EVENTS_CLICKHOUSE_FLUSH_INTERVAL_MS: z.coerce.number().int().default(1000), METRICS_CLICKHOUSE_BATCH_SIZE: z.coerce.number().int().default(10000), diff 
--git a/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts b/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts index 973cd5f96cd..add50434d48 100644 --- a/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts +++ b/apps/webapp/app/routes/realtime.v1.batches.$batchId.ts @@ -33,8 +33,7 @@ export const loader = createLoaderApiRoute( }, }, async ({ authentication, request, resource: batchRun, apiVersion }) => { - // Pick the Electric proxy or the notifier-backed batch feed - // per org (defaults to Electric). Both implement streamBatch. + // Pick the Electric proxy or the native backend per org (defaults to Electric); both implement streamBatch. const client = await resolveRealtimeStreamClient(authentication.environment); return client.streamBatch( diff --git a/apps/webapp/app/routes/realtime.v1.runs.$runId.ts b/apps/webapp/app/routes/realtime.v1.runs.$runId.ts index 3e224ddedf2..46118c1d894 100644 --- a/apps/webapp/app/routes/realtime.v1.runs.$runId.ts +++ b/apps/webapp/app/routes/realtime.v1.runs.$runId.ts @@ -48,9 +48,7 @@ export const loader = createLoaderApiRoute( }, }, async ({ authentication, request, resource: run, apiVersion }) => { - // Pick the Electric proxy or the notifier-backed shim per org (defaults to - // Electric; controlled by REALTIME_NOTIFIER_ENABLED + the realtimeBackend - // feature flag). Both implement the same streamRun contract. + // Pick the Electric proxy or the native backend per org (defaults to Electric); both implement streamRun. const client = await resolveRealtimeStreamClient(authentication.environment); return client.streamRun( @@ -60,10 +58,7 @@ export const loader = createLoaderApiRoute( apiVersion, authentication.realtime, request.headers.get("x-trigger-electric-version") ?? undefined, - // Propagate abort on client disconnect so the upstream Electric long-poll - // fetch is cancelled too. 
Without this, undici buffers from the unconsumed - // upstream response body accumulate until Electric's poll timeout, causing - // steady RSS growth on api (see docs/runbooks for the H1 isolation test). + // Propagate abort on client disconnect so the upstream Electric long-poll is cancelled too, else undici buffers grow RSS until the poll timeout. getRequestAbortSignal() ); } diff --git a/apps/webapp/app/routes/realtime.v1.runs.ts b/apps/webapp/app/routes/realtime.v1.runs.ts index 436f4ef48d8..2e3617800fe 100644 --- a/apps/webapp/app/routes/realtime.v1.runs.ts +++ b/apps/webapp/app/routes/realtime.v1.runs.ts @@ -25,12 +25,7 @@ export const loader = createLoaderApiRoute( authorization: { action: "read", resource: (_, __, searchParams) => - // Pre-RBAC, the resource was the searchParams object itself and - // the legacy `checkAuthorization` iterated `Object.keys`, so a - // JWT with type-level `read:tags` (no id) granted access to the - // unfiltered runs stream. Including `{ type: "tags" }` here - // preserves that — per-id `read:tags:` still grants only - // when the filter includes that tag. + // `{ type: "tags" }` preserves pre-RBAC type-level `read:tags` access to the unfiltered stream; per-id `read:tags:` still grants only when the filter includes that tag. anyResource([ { type: "runs" }, { type: "tags" }, @@ -39,8 +34,7 @@ export const loader = createLoaderApiRoute( }, }, async ({ searchParams, authentication, request, apiVersion }) => { - // Pick the Electric proxy or the notifier-backed tag-list feed per org - // (defaults to Electric). Both implement streamRuns. + // Pick the Electric proxy or the native backend per org (defaults to Electric); both implement streamRuns. 
const client = await resolveRealtimeStreamClient(authentication.environment); return client.streamRuns( diff --git a/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts b/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts index c563621408c..794938e9807 100644 --- a/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts +++ b/apps/webapp/app/services/clickhouse/clickhouseFactory.server.ts @@ -211,7 +211,7 @@ function initializeRunEngineClickhouseClient(): ClickHouse { }); } -/** Realtime runs feed tag/batch id resolution (`REALTIME_RUNS_CLICKHOUSE_URL`); +/** Realtime runs feed tag/batch id resolution (`REALTIME_BACKEND_NATIVE_CLICKHOUSE_URL`); * falls back to the default client if unset. */ const defaultRealtimeClickhouseClient = singleton( "realtimeClickhouseClient", @@ -219,25 +219,25 @@ const defaultRealtimeClickhouseClient = singleton( ); function initializeRealtimeClickhouseClient(): ClickHouse { - if (!env.REALTIME_RUNS_CLICKHOUSE_URL) { + if (!env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_URL) { return defaultClickhouseClient; } - const url = new URL(env.REALTIME_RUNS_CLICKHOUSE_URL); + const url = new URL(env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_URL); url.searchParams.delete("secure"); return new ClickHouse({ url: url.toString(), name: "realtime-runs-clickhouse", keepAlive: { - enabled: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_ENABLED === "1", - idleSocketTtl: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS, + enabled: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_KEEP_ALIVE_ENABLED === "1", + idleSocketTtl: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS, }, - logLevel: env.REALTIME_RUNS_CLICKHOUSE_LOG_LEVEL, + logLevel: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_LOG_LEVEL, compression: { - request: env.REALTIME_RUNS_CLICKHOUSE_COMPRESSION_REQUEST === "1", + request: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_COMPRESSION_REQUEST === "1", }, - maxOpenConnections: env.REALTIME_RUNS_CLICKHOUSE_MAX_OPEN_CONNECTIONS, + 
maxOpenConnections: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_MAX_OPEN_CONNECTIONS, }); } @@ -366,14 +366,14 @@ function buildOrgClickhouseClient(url: string, clientType: ClientType): ClickHou url: parsed.toString(), name, keepAlive: { - enabled: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_ENABLED === "1", - idleSocketTtl: env.REALTIME_RUNS_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS, + enabled: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_KEEP_ALIVE_ENABLED === "1", + idleSocketTtl: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_KEEP_ALIVE_IDLE_SOCKET_TTL_MS, }, - logLevel: env.REALTIME_RUNS_CLICKHOUSE_LOG_LEVEL, + logLevel: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_LOG_LEVEL, compression: { - request: env.REALTIME_RUNS_CLICKHOUSE_COMPRESSION_REQUEST === "1", + request: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_COMPRESSION_REQUEST === "1", }, - maxOpenConnections: env.REALTIME_RUNS_CLICKHOUSE_MAX_OPEN_CONNECTIONS, + maxOpenConnections: env.REALTIME_BACKEND_NATIVE_CLICKHOUSE_MAX_OPEN_CONNECTIONS, }); case "standard": case "query": diff --git a/apps/webapp/app/services/realtime/boundedTtlCache.ts b/apps/webapp/app/services/realtime/boundedTtlCache.ts index 8efcde55609..ac422880ded 100644 --- a/apps/webapp/app/services/realtime/boundedTtlCache.ts +++ b/apps/webapp/app/services/realtime/boundedTtlCache.ts @@ -1,15 +1,6 @@ /** - * Tiny in-process bounded TTL cache shared by the realtime feeds. - * - * Entries expire after `ttlMs`. An expired entry is evicted when read (`get`); on - * write, if the cache is at `maxEntries`, expired entries are swept and, if it's - * still full (pathologically all live), the oldest insertion is dropped. Node is - * single-threaded so no locking is needed. Used where a miss is cheap and - * correctness-safe (read-through hydration, per-handle working sets, per-org flag - * resolution). - * - * A stored value of `undefined` cannot be distinguished from a miss; callers that - * need to cache "absence" should store an explicit sentinel (e.g. `null`). 
+ * Tiny in-process bounded TTL cache shared by the realtime feeds: entries expire after `ttlMs` (evicted on read), + * and at-capacity writes sweep expired entries then drop the oldest. A stored `undefined` is indistinguishable from a miss (use `null` for absence). */ export class BoundedTtlCache { readonly #entries = new Map(); diff --git a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts index 003646bb74a..7c74d15add6 100644 --- a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts +++ b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts @@ -10,15 +10,9 @@ export type ClickHouseRunListResolverOptions = { }; /** - * Resolves the realtime tag/list filter into matching run ids via ClickHouse - * `listRunIds`. Tag matching is contains-ANY (OR), the same - * semantics the dashboard runs list uses. Filter-only: ids only, hydrated from - * Postgres by id afterward. This keeps the realtime tag feed off the Postgres - * `runTags` GIN index entirely. - * - * (Multi-tag subscribeToRunsWithTag is therefore OR, not the AND that Electric's - * `runTags @> ARRAY[...]` shape used. Restoring AND is a follow-up: add a - * `hasAll` mode to the ClickHouse runs filter and use it here.) + * Resolves the realtime tag/list filter into matching run ids via ClickHouse `listRunIds` (filter-only; + * rows hydrated from Postgres by id afterward). Tag matching is contains-ANY (OR) — note this differs from + * Electric's `runTags @> ARRAY[...]` AND shape; restoring AND needs a `hasAll` mode on the ClickHouse filter. 
*/ export class ClickHouseRunListResolver implements RunListResolver { constructor(private readonly options: ClickHouseRunListResolverOptions) {} diff --git a/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts b/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts index 6a276bcb03d..efe711a7273 100644 --- a/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts +++ b/apps/webapp/app/services/realtime/electricStreamProtocol.server.ts @@ -1,22 +1,8 @@ /** - * Electric HTTP shape-stream wire protocol serializer for the single-run feed. - * - * This re-emits the exact wire shape that the deployed `@electric-sql/client` - * (1.0.14 modern + 0.4.0 legacy) and the SDK's `SubscribeRunRawShape` expect, - * so the notifier-backed realtime feed stays byte-faithful to what those clients - * already expect. - * - * The module is intentionally pure: no DB, Redis, or env access, so the wire - * contract can be unit-tested by round-tripping through the real client parser - * + the SDK schema. Header rewrites, tokens, and transport live in the client. - * - * Wire facts this encodes (verified against @electric-sql/client@1.0.14): - * - Response body is a JSON array of messages; an empty body is treated as `[]`. - * - Each column value is wire-encoded as a STRING (or null); the client decodes - * it back using the per-column `electric-schema` header. Columns absent from - * the schema are passed through unparsed (so text/timestamp stay strings). - * - `up-to-date` is the only control message that makes the client emit rows. - * - Re-sending the full row each cycle is idempotent: the client merges by `key`. + * Pure (no DB/Redis/env) Electric HTTP shape-stream wire serializer, byte-faithful to what the + * deployed `@electric-sql/client` (1.0.14 + 0.4.0) and the SDK's `SubscribeRunRawShape` expect. 
+ * Each column value is wire-encoded as a string (or null) decoded via the `electric-schema` header; + * `up-to-date` is the only control message that makes the client emit, and re-sending a full row is idempotent. */ export type ElectricColumnType = @@ -33,22 +19,11 @@ type ElectricColumn = { type: ElectricColumnType; /** Array dimensionality. 1 => `type[]` (Postgres `{a,b}` literal). */ dims?: number; - /** - * Array columns only. True when the Postgres column has NO default, so an - * empty/absent value is stored as SQL NULL (Electric emits `null`) rather than - * an empty-array literal `{}`. Prisma erases this distinction — it coerces both - * NULL and `{}` to `[]` on read — so we re-derive the wire form from the column's - * known schema. `runTags` has no default; `realtimeStreams` has `@default([])`. - */ + /** Array columns only: true when the column has no SQL default, so an empty value emits `null` (not `{}`). Prisma erases this distinction, so we re-derive it here. */ emptyArrayAsNull?: boolean; }; -/** - * The columns the realtime run feed exposes, mirroring `DEFAULT_ELECTRIC_COLUMNS` - * in `realtimeClient.server.ts` and their Postgres types from the `TaskRun` - * Prisma model. The `type`/`dims` drive both the `electric-schema` header and - * the value encoding. Keep in sync with `DEFAULT_ELECTRIC_COLUMNS`. - */ +/** Columns the realtime run feed exposes; keep in sync with `DEFAULT_ELECTRIC_COLUMNS`. `type`/`dims` drive the schema header and value encoding. */ export const RUN_ELECTRIC_COLUMNS: ReadonlyArray = [ { name: "id", type: "text" }, { name: "taskIdentifier", type: "text" }, @@ -81,10 +56,7 @@ export const RUN_ELECTRIC_COLUMNS: ReadonlyArray = [ /** Columns that can never be skipped via `skipColumns` (mirrors realtimeClient). */ export const RESERVED_COLUMNS = ["id", "taskIdentifier", "friendlyId", "status", "createdAt"]; -/** - * Shape of a single run hydrated for the realtime feed. 
Structurally compatible - * with the Prisma `TaskRun` projection produced by `RunHydrator`. - */ +/** A single run hydrated for the realtime feed; structurally compatible with the `RunHydrator` Prisma `TaskRun` projection. */ export type RealtimeRunRow = { id: string; taskIdentifier: string; @@ -219,11 +191,7 @@ export function buildElectricSchemaHeader(skipColumns: string[] = []): string { return JSON.stringify(schema); } -/** - * Initial snapshot body: a single `insert` for the row (if it exists) followed by - * `up-to-date`. An absent row emits a bare `up-to-date` (an empty shape), which is - * how Electric represents "no rows match". - */ +/** Initial snapshot body: an `insert` for the row (if present) then `up-to-date`; an absent row emits a bare `up-to-date` (empty shape). */ export function buildSnapshotBody(row: RealtimeRunRow | null, skipColumns: string[] = []): string { const messages: ShapeMessage[] = []; if (row) { @@ -257,12 +225,7 @@ export function buildUpToDateBody(): string { export type RowChange = { row: RealtimeRunRow; operation: "insert" | "update" }; -/** - * Multi-row body for the tag-list feed: one change message per row (insert for - * rows new to the shape, update for rows that advanced) followed by `up-to-date`. - * An empty `changes` array emits a bare `up-to-date`. The client merges every row - * by key, so re-emitting a full row is idempotent. - */ +/** Multi-row body for the tag-list feed: one change message per row then `up-to-date` (empty `changes` emits a bare `up-to-date`). */ export function buildRowsBody(changes: RowChange[], skipColumns: string[] = []): string { const messages: ShapeMessage[] = changes.map((change) => ({ key: runShapeKey(change.row.id), @@ -295,12 +258,7 @@ export function buildRowsBodyFromSerialized(changes: SerializedRowChange[]): str export const INITIAL_OFFSET = "-1"; -/** - * Opaque offset token, formatted to satisfy the client's `${number}_${number}` - * type. 
The first segment is the row's `updatedAt` epoch-ms (lets a live request - * detect whether the replica row has advanced past what the client already has); - * the second is a per-connection sequence counter. - */ +/** Opaque `_` offset token (client `${number}_${number}` type); the first segment lets a live request detect whether the row advanced. */ export function encodeOffset(updatedAtMs: number, seq: number): string { return `${Math.trunc(updatedAtMs)}_${Math.trunc(seq)}`; } diff --git a/apps/webapp/app/services/realtime/envChangeRouter.server.ts b/apps/webapp/app/services/realtime/envChangeRouter.server.ts index bba7a92452d..25b4f912a23 100644 --- a/apps/webapp/app/services/realtime/envChangeRouter.server.ts +++ b/apps/webapp/app/services/realtime/envChangeRouter.server.ts @@ -3,24 +3,10 @@ import { type RealtimeRunRow, serializeRunRow } from "./electricStreamProtocol.s import { logger } from "~/services/logger.server"; /** - * EnvChangeRouter — the per-instance routing layer that turns "feeds as predicates over - * one env stream" into cheap fan-out. - * - * It owns ONE subscription per environment (over the RunChangeNotifier) and an inverted - * index of the feeds currently held by THIS instance: `runId -> feeds`, `tag -> feeds`, - * `batchId -> feeds`. On a coalesced batch of ChangeRecords it: - * 1. routes each record to only the matching held feeds via the index (O(record-tags), - * not O(feeds)) — a record that matches nothing costs nothing; - * 2. batch-hydrates the matched runs from Postgres ONCE per column set (collapsing the - * hot-shared-tag fan-out: one run matching N feeds = one `hydrateByIds`, not N); - * 3. serializes each row's wire value ONCE per column set, reused across all matching - * feeds; - * 4. resolves each matching feed's pending wait with its hydrated+serialized rows. 
- * - * It is stateless across reconnects: the index is rebuilt from whatever feeds this - * instance happens to hold, so no shape affinity or cross-poll memory is required. The - * per-handle working-set diff (insert vs update) stays in the consumer; the router only - * decides membership, hydrates, and serializes. + * EnvChangeRouter — per-instance routing layer that fans one env's change stream out to the feeds it + * matches. Owns one subscription per env (over the RunChangeNotifier) plus an inverted index of held + * feeds, then per batch: routes via the index, batch-hydrates matched runs once per column set, + * serializes each row's wire value once, and resolves each matched feed's pending wait. Stateless across reconnects. */ export type WakeReason = "notify" | "timeout" | "abort"; @@ -183,9 +169,7 @@ export class EnvChangeRouter { }; this.#envs.set(environmentId, env); env.unsubscribe = this.options.source.subscribeToEnv(environmentId, (records) => { - // Fire-and-forget; the notifier doesn't await us. A hydrate failure must be caught - // here (an unhandled rejection exits the process); the matched feeds' waiters stay - // armed and time out into the full-resolve backstop. + // Fire-and-forget; catch hydrate failures here (unhandled rejection exits the process) — waiters time out into the backstop. this.#onBatch(environmentId, env, records).catch((error) => { logger.error("[envChangeRouter] failed to route a change batch", { environmentId, @@ -341,9 +325,7 @@ export class EnvChangeRouter { } } - /** Authoritative re-check for tag feeds: the hydrated row's tags intersect the filter - * and its createdAt is within the feed's window. Handles partial-record candidates and - * guards record/row tag skew. */ + /** Authoritative re-check for tag feeds: the hydrated row's tags intersect the filter and its createdAt is within the feed's window. 
*/ #tagRowMatches(row: RealtimeRunRow, filter: Extract): boolean { if (filter.createdAtFloorMs !== undefined && row.createdAt.getTime() < filter.createdAtFloorMs) { return false; diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts b/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts similarity index 79% rename from apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts rename to apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts index 8d5d597c65b..fd34a1509b8 100644 --- a/apps/webapp/app/services/realtime/notifierRealtimeClient.server.ts +++ b/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts @@ -34,14 +34,10 @@ import { import { type RunHydrator, type RunListResolver } from "./runReader.server"; import { type RealtimeConcurrencyLimiter } from "./realtimeConcurrencyLimiter.server"; -/** The tag-list feed resolves ids via ClickHouse, which needs org + project + env. - * `authentication.environment` (AuthenticatedEnvironment) provides projectId, so - * widening here avoids touching the Electric client's RealtimeEnvironment type. */ +/** Widened with projectId so the tag-list feed can resolve ids via ClickHouse (needs org + project + env). */ export type RealtimeListEnvironment = RealtimeEnvironment & { projectId: string }; -/** The realtime feeds the run routes depend on (single-run, tag-list, batch). Both - * the Electric client and this notifier client satisfy it, so the routes can switch - * between them behind a flag. */ +/** The realtime feeds the run routes depend on (single-run, tag-list, batch); both backends satisfy it. */ export interface RealtimeStreamClient { streamRun( url: URL | string, @@ -74,13 +70,10 @@ export interface RealtimeStreamClient { export type WakeupReason = "notify" | "timeout" | "abort"; -/** How a live poll resolved, for observability: - * - `fast-hydrate`: the router woke this feed with matched rows (hydrated by id, NO - * ClickHouse). 
Non-matching changes never wake the feed, so they cost nothing. - * - `full-resolve`: the backstop timeout did a ClickHouse resolve (the correctness net). */ +/** How a live poll resolved: `fast-hydrate` (router woke us, hydrate-by-id) or `full-resolve` (backstop ClickHouse resolve). */ export type LivePollPath = "fast-hydrate" | "full-resolve"; -export type NotifierRealtimeClientOptions = { +export type NativeRealtimeClientOptions = { runReader: RunHydrator; /** Resolves the tag/list filter into the matching id-set (filter-only). */ runListResolver: RunListResolver; @@ -88,8 +81,12 @@ export type NotifierRealtimeClientOptions = { router: EnvChangeRouter; limiter: RealtimeConcurrencyLimiter; cachedLimitProvider: CachedLimitProvider; - /** Backstop wait before refetching on a live request (ms). Defaults to 5000. */ + /** Fallback per-env concurrent-connection limit when the org has none cached. */ + defaultConcurrencyLimit?: number; + /** Backstop wait before refetching on a live request (ms). Defaults to 20000. */ livePollTimeoutMs?: number; + /** Jitter ratio applied to the live-poll timeout (0.15 = ±15%). */ + livePollJitterRatio?: number; /** Ceiling for the tag-list createdAt lookback window (ms). */ maximumCreatedAtFilterAgeMs: number; /** Hard cap on tag-list snapshot size. Defaults to 1000. */ @@ -100,34 +97,30 @@ export type NotifierRealtimeClientOptions = { runSetResolveCacheMaxEntries?: number; /** Max entries in the per-handle working-set cache. Defaults to 10000. */ listCacheMaxEntries?: number; - /** Epoch-aligned bucket (ms) the tag-list createdAt lower bound is floored to, so - * same-tag feeds pinned within the same bucket share a cache entry. Defaults to - * 60000. 0 disables bucketing. */ + /** TTL (ms) for working-set cache entries. Defaults to 300000. */ + workingSetCacheTtlMs?: number; + /** Epoch-aligned bucket (ms) the tag-list createdAt floor is floored to, so same-tag feeds share a cache entry. Defaults to 60000; 0 disables. 
*/ runSetCreatedAtBucketMs?: number; - /** When true (default), a multi-run live poll holds the connection until a real delta - * or the backstop, rather than returning an empty up-to-date the client would re-issue. */ + /** When true (default), a multi-run live poll holds until a real delta or the backstop rather than returning an empty up-to-date. */ holdOnEmpty?: boolean; - /** Max concurrent fresh ClickHouse resolves (cache misses) across this instance. Bounds a - * distinct-filter reconnect stampede so it queues instead of hammering ClickHouse. Defaults - * to 16; 0 disables the gate (unbounded). */ + /** Max concurrent fresh ClickHouse resolves (cache misses) per instance, bounding a distinct-filter stampede. Defaults to 16; 0 disables. */ resolveAdmissionLimit?: number; /** Observability hook: why a live request woke (notify vs timeout vs abort). */ onWakeup?: (reason: WakeupReason) => void; /** Observability hook: how a live poll resolved (fast path vs full resolve). */ onLivePollPath?: (path: LivePollPath) => void; - /** Observability hook: whether a multi-run resolve (initial/backstop) hit the cache, - * coalesced onto an in-flight resolve, or missed (fresh ClickHouse + Postgres). */ + /** Observability hook: whether a multi-run resolve hit the cache, coalesced onto an in-flight resolve, or missed. */ onRunSetResolve?: (result: "hit" | "miss" | "coalesced") => void; /** Observability hook: latency (ms) of the ClickHouse resolve / Postgres hydrate. */ onRunSetQuery?: (stage: "resolve" | "hydrate", ms: number) => void; - /** Observability hook: a fresh resolve had to wait `ms` for an admission permit (the gate - * engaged — i.e. a stampede was throttled). Not called when a permit is free. */ + /** Observability hook: a fresh resolve waited `ms` for an admission permit (only when the gate engaged). 
*/ onResolveAdmissionWait?: (ms: number) => void; }; const DEFAULT_CONCURRENCY_LIMIT = 100_000; // Matches Electric's ~20s live long-poll hold (jittered ±15% per request). const DEFAULT_LIVE_POLL_TIMEOUT_MS = 20_000; +const DEFAULT_LIVE_POLL_JITTER_RATIO = 0.15; const DEFAULT_MAX_LIST_RESULTS = 1_000; const LIST_CACHE_TTL_MS = 5 * 60_000; const LIST_CACHE_MAX_ENTRIES = 10_000; @@ -136,14 +129,7 @@ const DEFAULT_RUNSET_CACHE_MAX_ENTRIES = 5_000; const DEFAULT_RUNSET_CREATED_AT_BUCKET_MS = 60_000; const DEFAULT_RESOLVE_ADMISSION_LIMIT = 16; -/** - * Fair FIFO semaphore bounding how many fresh ClickHouse resolves run concurrently. It sits - * BEHIND the single-flight + TTL cache, so only genuine cache-miss resolves take a permit: a - * same-filter reconnect stampede still collapses to one in-flight resolve (one permit), while - * a distinct-filter stampede — where every filter is a different cache key and so can't - * coalesce — is throttled to `limit` concurrent CH queries instead of firing all N at the - * database at once. Trades a little connect latency under a stampede for bounded CH load. - */ +/** Fair FIFO semaphore bounding concurrent fresh ClickHouse resolves. Sits behind the single-flight + TTL cache, so only genuine cache-miss resolves take a permit. */ class ResolveAdmissionGate { #available: number; #inUse = 0; @@ -179,16 +165,14 @@ class ResolveAdmissionGate { } } -/** A multi-run feed's filter. Tag-list sets `tags` (+ pinned `createdAtAfter`); - * the batch feed sets `batchId`. Both resolve to an id-set via the resolver. */ +/** A multi-run feed's filter: tag-list sets `tags` (+ pinned `createdAtAfter`); the batch feed sets `batchId`. */ type RunSetFilter = { tags?: string[]; batchId?: string; createdAtAfter?: Date; }; -/** Per-handle working set: runId -> last-emitted updatedAt (ms), so live polls - * emit only rows that advanced. */ +/** Per-handle working set: runId -> last-emitted updatedAt (ms), so live polls emit only rows that advanced. 
*/ type WorkingSet = Map; type ResponseHeaderInput = { @@ -199,43 +183,23 @@ type ResponseHeaderInput = { }; /** - * Notifier-backed implementation of the realtime run feeds. All three feeds are - * predicates over ONE per-environment change stream (the EnvChangeRouter); the router - * decides membership, hydrates the matched runs from a read replica, and serializes their - * wire values once. This client owns the snapshot, the per-handle working-set diff, the - * ClickHouse-backed backstop, and the wire response. - * - * Single-run (`streamRun`): - * - initial (`offset=-1`): hydrate + emit `insert` + `up-to-date` (with schema). - * - live: the router wakes this feed when its run changes; emit a full-row `update` when - * `updatedAt` advanced past what the client has, else a bare `up-to-date`. The backstop - * re-checks via `getRunById`. - * - * Multi-run feeds (`streamRuns` tag-list, `streamBatch`): - * - initial: resolve the matching id-set via ClickHouse (filter-only), hydrate by-id from - * Postgres, emit N `insert`s, seed the working set. - * - live: the router wakes the feed with the matched runs already hydrated + serialized; - * diff them on the authoritative Postgres `updatedAt` against the per-handle working - * set and emit only new/advanced rows. The backstop (timeout) does a full ClickHouse - * resolve — the correctness net that catches gaps and drops departed runs. - * - * Tokens are opaque: `offset` = `_`, `handle` is per-shape, `cursor` - * is a live-only counter. The wire format is produced by `electricStreamProtocol`. + * Native-backend implementation of the realtime run feeds. All three feeds are predicates over ONE + * per-environment change stream (the EnvChangeRouter), which decides membership, hydrates the matched + * runs, and serializes their wire values once; this client owns the snapshot, the per-handle working-set + * diff, the ClickHouse backstop, and the wire response (opaque `offset`/`handle`/`cursor` tokens). 
*/ -export class NotifierRealtimeClient implements RealtimeStreamClient { +export class NativeRealtimeClient implements RealtimeStreamClient { #seq = 0; readonly #workingSetCache: BoundedTtlCache; - /** Coalescing cache for the multi-run (resolveIds -> hydrateByIds) pair used by the - * initial snapshot and the backstop, keyed by (env, filter, columns). Collapses a - * reconnect/snapshot stampede of identical filters into one shared resolve+hydrate. */ + /** Coalescing cache for the multi-run resolve+hydrate, keyed by (env, filter, columns), so identical filters share one resolve. */ readonly #runSetCache: BoundedTtlCache; readonly #runSetInflight = new Map>(); /** Bounds concurrent fresh CH resolves (undefined => unbounded). */ readonly #admissionGate?: ResolveAdmissionGate; - constructor(private readonly options: NotifierRealtimeClientOptions) { + constructor(private readonly options: NativeRealtimeClientOptions) { this.#workingSetCache = new BoundedTtlCache( - LIST_CACHE_TTL_MS, + options.workingSetCacheTtlMs ?? LIST_CACHE_TTL_MS, options.listCacheMaxEntries ?? LIST_CACHE_MAX_ENTRIES ); this.#runSetCache = new BoundedTtlCache( @@ -411,12 +375,7 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { }); } - /** - * Live poll for a single-run feed. The router wakes this feed when its run changes, - * with the run already hydrated + serialized (no ClickHouse, ever). On the backstop - * timeout it re-checks via `getRunById`. Only-on-advance: emit a full-row `update` when - * the row moved past what the client already has; else a bare `up-to-date`. - */ + /** Live poll for a single-run feed: emit a full-row `update` only when the row advanced past the client's offset, else a bare `up-to-date`. */ async #liveResponse(params: { environment: RealtimeEnvironment; runId: string; @@ -526,16 +485,7 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { }); } - /** - * Live poll for a multi-run feed. 
Two paths: - * - Fast path (router notify): the router woke us with the matched runs already - * membership-confirmed, hydrated, and serialized (no ClickHouse). Diff them against - * the per-handle working set and emit new/advanced rows. - * - Backstop (timeout): a full ClickHouse resolve + hydrate. The correctness net — - * catches members missed during a gap and drops runs that left the filter. - * With hold-on-empty (default) the connection holds until a real delta or the backstop - * rather than returning an empty response the client would re-issue. - */ + /** Live poll for a multi-run feed: fast path diffs router-notified rows against the working set; the timeout backstop does a full ClickHouse resolve. */ async #runSetLiveResponse( environment: RealtimeListEnvironment, filter: RunSetFilter, @@ -692,12 +642,7 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { return { changes, maxUpdatedAt, touched }; } - /** - * Diff hydrated rows against the prior working set on the authoritative Postgres - * `updatedAt`: a run not in the set is an `insert`, one whose `updatedAt` advanced is an - * `update`. On a working-set miss, anything past the offset floor is a merge-safe - * `update`. Used by the snapshot and the backstop full-resolve. - */ + /** Diff hydrated rows against the prior working set on Postgres `updatedAt`: not-in-set is `insert`, advanced is `update`. */ #diffRows( rows: RealtimeRunRow[], prevSeen: WorkingSet | undefined, @@ -735,12 +680,7 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { return merged; } - /** - * Resolve the filter's id-set (ClickHouse) and hydrate the rows (Postgres), coalesced + - * short-TTL cached by (env, filter, columns). Used by the initial snapshot and the - * backstop. A reconnect/snapshot stampede of identical filters shares ONE resolve+hydrate - * (concurrent callers await the in-flight one; callers within the TTL reuse the rows). 
- */ + /** Resolve the filter's id-set (ClickHouse) and hydrate (Postgres), coalesced + short-TTL cached so identical filters share one resolve+hydrate. */ async #resolveAndHydrate( environment: RealtimeListEnvironment, filter: RunSetFilter, @@ -818,9 +758,7 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { /** Stable cache key for the resolve+hydrate cache. Same key => same id-set and the * same projected columns, so cached rows always match the requesting feed. */ #runSetCacheKey(environmentId: string, filter: RunSetFilter, skipColumns: string[]): string { - // JSON-encode the arrays (not a join) so a value containing the separators — - // e.g. a tag with a comma — can't collide: ["a,b"] must not key the same as - // ["a","b"], which are different ClickHouse filters. + // JSON-encode the arrays (not a join) so a tag containing the separator can't collide with a different filter. const tags = filter.tags && filter.tags.length > 0 ? JSON.stringify([...filter.tags].sort()) : ""; const cols = skipColumns.length > 0 ? JSON.stringify([...skipColumns].sort()) : ""; const maxListResults = this.options.maxListResults ?? DEFAULT_MAX_LIST_RESULTS; @@ -842,7 +780,7 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { }); if (ids.length >= maxListResults) { - logger.warn("[notifierRealtimeClient] run-set feed hit the result cap", { + logger.warn("[nativeRealtimeClient] run-set feed hit the result cap", { environmentId: environment.id, filter, cap: maxListResults, @@ -857,11 +795,7 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { const floor = new Date(Date.now() - this.options.maximumCreatedAtFilterAgeMs); const parsed = safeParseNaturalLanguageDurationAgo(createdAt ?? "24h"); const resolved = !parsed || parsed < floor ? 
floor : parsed; - // Quantize the lower bound to a coarse epoch-aligned bucket and pin THAT in the - // handle, so same-tag feeds whose windows land in the same bucket resolve to the - // same filter -> same coalescing cache key -> one shared ClickHouse + Postgres - // query instead of one per feed. Floored (rounds the bound earlier), so the - // window only ever widens by < bucket and never drops a run the client should see. + // Bucket the lower bound so same-tag feeds share a cache key; floored, so the window only ever widens by < bucket. return new Date(this.#bucketCreatedAtMs(resolved.getTime())); } @@ -921,10 +855,7 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { }; } - /** - * Runs `work` inside a per-env concurrency slot: acquires a slot (429 if over the - * org limit, 500 if the limit can't be read) and always releases it afterward. - */ + /** Runs `work` inside a per-env concurrency slot (429 if over the org limit, 500 if the limit can't be read), always releasing it after. */ async #withConcurrencySlot( environment: RealtimeEnvironment, work: () => Promise @@ -932,11 +863,11 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { const requestId = randomUUID(); const concurrencyLimit = await this.options.cachedLimitProvider.getCachedLimit( environment.organizationId, - DEFAULT_CONCURRENCY_LIMIT + this.options.defaultConcurrencyLimit ?? DEFAULT_CONCURRENCY_LIMIT ); if (concurrencyLimit == null) { - logger.error("[notifierRealtimeClient] Failed to get concurrency limit", { + logger.error("[nativeRealtimeClient] Failed to get concurrency limit", { organizationId: environment.organizationId, }); return json({ error: "Failed to get concurrency limit" }, { status: 500 }); @@ -961,8 +892,9 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { #jitteredTimeout(): number { const base = this.options.livePollTimeoutMs ?? DEFAULT_LIVE_POLL_TIMEOUT_MS; - // +/-15% jitter to avoid synchronized refetch herds. 
- return Math.round(base * (0.85 + Math.random() * 0.3)); + // Jittered to avoid synchronized refetch herds. + const ratio = this.options.livePollJitterRatio ?? DEFAULT_LIVE_POLL_JITTER_RATIO; + return Math.round(base * (1 - ratio + Math.random() * 2 * ratio)); } #buildResponse( @@ -978,18 +910,11 @@ export class NotifierRealtimeClient implements RealtimeStreamClient { responseHeaders.set("content-type", "application/json"); responseHeaders.set("cache-control", "no-store"); - // Carry CORS on the response itself, mirroring how the Electric upstream does - // (apiCors passes a response through untouched once it has allow-origin). Browsers - // can only read the electric-* headers cross-origin if they're explicitly exposed; - // without this the deployed react-hooks fail with MissingHeadersError. Bearer-token - // requests are non-credentialed, so a wildcard is safe. + // Expose the electric-* headers cross-origin or the deployed react-hooks fail with MissingHeadersError (bearer requests are non-credentialed, so wildcard is safe). responseHeaders.set("access-control-allow-origin", "*"); responseHeaders.set("access-control-expose-headers", "*"); - // Modern clients (1.0.14) send `x-trigger-electric-version` and read the - // lowercase `electric-*` headers. Legacy clients (0.4.0) omit the version and - // read `electric-shape-id`/`electric-chunk-last-offset` (case-insensitive), - // matching realtimeClient's rewriteResponseHeaders behavior exactly. + // Modern clients send `x-trigger-electric-version` and read `electric-offset`/`electric-handle`; legacy clients omit it and read the shape-id/chunk-last-offset names. 
if (clientVersion) { responseHeaders.set("electric-offset", headers.offset); responseHeaders.set("electric-handle", headers.handle); diff --git a/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts similarity index 66% rename from apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts rename to apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts index 24d5f13b0c6..04e3435bb37 100644 --- a/apps/webapp/app/services/realtime/notifierRealtimeClientInstance.server.ts +++ b/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts @@ -7,33 +7,30 @@ import { getCachedLimit } from "../platform.v3.server"; import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; import { ClickHouseRunListResolver } from "./clickHouseRunListResolver.server"; import { EnvChangeRouter } from "./envChangeRouter.server"; -import { NotifierRealtimeClient } from "./notifierRealtimeClient.server"; +import { NativeRealtimeClient } from "./nativeRealtimeClient.server"; import { RealtimeConcurrencyLimiter } from "./realtimeConcurrencyLimiter.server"; import { getRunChangeNotifier } from "./runChangeNotifierInstance.server"; import { RunHydrator } from "./runReader.server"; -/** - * Process-singleton wiring for the notifier-backed realtime client. Only - * constructed when a request actually routes to the - * notifier backend, so a disabled webapp never instantiates it. - */ -function initializeNotifierRealtimeClient(): NotifierRealtimeClient { +// Process-singleton wiring for the native realtime client; only constructed when a +// request actually routes to it, so a disabled webapp never instantiates it. +function initializeNativeRealtimeClient(): NativeRealtimeClient { const wakeups = new Counter({ - name: "realtime_notifier_wakeups_total", - help: "Live realtime notifier wakeups by reason. 
A rising 'timeout' share suggests a write site is missing its publishChangeRecord delegate.", + name: "realtime_native_wakeups_total", + help: "Live realtime wakeups by reason. A rising 'timeout' share suggests a write site is missing its publishChangeRecord delegate.", labelNames: ["reason"] as const, registers: [metricsRegister], }); const runSetResolves = new Counter({ - name: "realtime_notifier_runset_resolve_total", + name: "realtime_native_runset_resolve_total", help: "Multi-run (tag-list/batch) resolve+hydrate outcomes. 'hit'/'coalesced' vs 'miss' shows how effectively concurrent same-filter feeds share a single ClickHouse + Postgres query under an env-wide wake.", labelNames: ["result"] as const, registers: [metricsRegister], }); const runSetQueryMs = new Histogram({ - name: "realtime_notifier_runset_query_ms", + name: "realtime_native_runset_query_ms", help: "Latency of the multi-run resolve (ClickHouse) and hydrate (Postgres) stages.", labelNames: ["stage"] as const, buckets: [1, 5, 10, 25, 50, 100, 250, 500, 1_000, 2_500, 5_000], @@ -41,26 +38,26 @@ function initializeNotifierRealtimeClient(): NotifierRealtimeClient { }); const livePollPaths = new Counter({ - name: "realtime_notifier_live_poll_total", + name: "realtime_native_live_poll_total", help: "How live polls resolved. 'fast-hydrate' = the router woke the feed with matched runs hydrated by id (no ClickHouse); 'full-resolve' = the backstop timeout did a ClickHouse resolve. 
A high fast-path share is the local-membership routing working.", labelNames: ["path"] as const, registers: [metricsRegister], }); const routerHydrates = new Counter({ - name: "realtime_notifier_router_hydrated_runs_total", + name: "realtime_native_router_hydrated_runs_total", help: "Runs hydrated by the EnvChangeRouter's batch-hydrate (one query per column set per wake, shared across all feeds matching the same run — the hot-shared-tag fan-out collapse).", registers: [metricsRegister], }); const resolveAdmissionWaits = new Counter({ - name: "realtime_notifier_resolve_admission_waits_total", + name: "realtime_native_resolve_admission_waits_total", help: "Fresh ClickHouse resolves that had to queue for an admission permit. A rising count means a distinct-filter reconnect stampede is being throttled (the gate is doing its job).", registers: [metricsRegister], }); const limiter = new RealtimeConcurrencyLimiter({ - keyPrefix: "tr:realtime:notifier:concurrency", + keyPrefix: "tr:realtime:native:concurrency", redis: { port: env.RATE_LIMIT_REDIS_PORT, host: env.RATE_LIMIT_REDIS_HOST, @@ -71,9 +68,12 @@ function initializeNotifierRealtimeClient(): NotifierRealtimeClient { }, }); - // One RunHydrator shared by the router (fast-path batch-hydrate) and the client - // (snapshot + backstop), so its single-flight + short-TTL cache covers both. - const runReader = new RunHydrator({ replica: $replica }); + // One RunHydrator shared by the router and the client, so its single-flight + short-TTL cache covers both. 
+ const runReader = new RunHydrator({ + replica: $replica, + cacheTtlMs: env.REALTIME_BACKEND_NATIVE_RUN_CACHE_TTL_MS, + maxCacheEntries: env.REALTIME_BACKEND_NATIVE_RUN_CACHE_MAX_ENTRIES, + }); const router = new EnvChangeRouter({ source: getRunChangeNotifier(), @@ -81,7 +81,7 @@ function initializeNotifierRealtimeClient(): NotifierRealtimeClient { onHydrate: (runCount) => routerHydrates.inc(runCount), }); - const client = new NotifierRealtimeClient({ + const client = new NativeRealtimeClient({ runReader, runListResolver: new ClickHouseRunListResolver({ getClickhouse: (organizationId) => @@ -100,15 +100,18 @@ function initializeNotifierRealtimeClient(): NotifierRealtimeClient { return result.val; }, }, - livePollTimeoutMs: env.REALTIME_NOTIFIER_LIVE_POLL_TIMEOUT_MS, + defaultConcurrencyLimit: env.REALTIME_BACKEND_NATIVE_DEFAULT_CONCURRENCY_LIMIT, + livePollTimeoutMs: env.REALTIME_BACKEND_NATIVE_LIVE_POLL_TIMEOUT_MS, + livePollJitterRatio: env.REALTIME_BACKEND_NATIVE_LIVE_POLL_JITTER_RATIO, maximumCreatedAtFilterAgeMs: env.REALTIME_MAXIMUM_CREATED_AT_FILTER_AGE_IN_MS, - maxListResults: env.REALTIME_NOTIFIER_MAX_LIST_RESULTS, - runSetResolveCacheTtlMs: env.REALTIME_NOTIFIER_RUNSET_CACHE_TTL_MS, - runSetResolveCacheMaxEntries: env.REALTIME_NOTIFIER_RUNSET_CACHE_MAX_ENTRIES, - listCacheMaxEntries: env.REALTIME_NOTIFIER_WORKING_SET_MAX_ENTRIES, - runSetCreatedAtBucketMs: env.REALTIME_NOTIFIER_RUNSET_CREATED_AT_BUCKET_MS, - holdOnEmpty: env.REALTIME_NOTIFIER_HOLD_ON_EMPTY === "1", - resolveAdmissionLimit: env.REALTIME_NOTIFIER_RESOLVE_ADMISSION_LIMIT, + maxListResults: env.REALTIME_BACKEND_NATIVE_MAX_LIST_RESULTS, + runSetResolveCacheTtlMs: env.REALTIME_BACKEND_NATIVE_RUNSET_CACHE_TTL_MS, + runSetResolveCacheMaxEntries: env.REALTIME_BACKEND_NATIVE_RUNSET_CACHE_MAX_ENTRIES, + listCacheMaxEntries: env.REALTIME_BACKEND_NATIVE_WORKING_SET_MAX_ENTRIES, + workingSetCacheTtlMs: env.REALTIME_BACKEND_NATIVE_WORKING_SET_TTL_MS, + runSetCreatedAtBucketMs: 
env.REALTIME_BACKEND_NATIVE_RUNSET_CREATED_AT_BUCKET_MS, + holdOnEmpty: env.REALTIME_BACKEND_NATIVE_HOLD_ON_EMPTY === "1", + resolveAdmissionLimit: env.REALTIME_BACKEND_NATIVE_RESOLVE_ADMISSION_LIMIT, onWakeup: (reason) => wakeups.inc({ reason }), onLivePollPath: (path) => livePollPaths.inc({ path }), onRunSetResolve: (result) => runSetResolves.inc({ result }), @@ -117,7 +120,7 @@ function initializeNotifierRealtimeClient(): NotifierRealtimeClient { }); new Gauge({ - name: "realtime_notifier_working_set_size", + name: "realtime_native_working_set_size", help: "Entries in the per-handle working-set cache (one per active multi-run feed session).", registers: [metricsRegister], collect() { @@ -126,7 +129,7 @@ function initializeNotifierRealtimeClient(): NotifierRealtimeClient { }); new Gauge({ - name: "realtime_notifier_resolve_admission_in_use", + name: "realtime_native_resolve_admission_in_use", help: "Fresh ClickHouse resolves currently holding an admission permit (live concurrency against the gate's limit).", registers: [metricsRegister], collect() { @@ -137,6 +140,6 @@ function initializeNotifierRealtimeClient(): NotifierRealtimeClient { return client; } -export function getNotifierRealtimeClient(): NotifierRealtimeClient { - return singleton("notifierRealtimeClient", initializeNotifierRealtimeClient); +export function getNativeRealtimeClient(): NativeRealtimeClient { + return singleton("nativeRealtimeClient", initializeNativeRealtimeClient); } diff --git a/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts b/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts index a935858fef0..1b6fdb3b0b4 100644 --- a/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts +++ b/apps/webapp/app/services/realtime/realtimeConcurrencyLimiter.server.ts @@ -11,12 +11,8 @@ export type RealtimeConcurrencyLimiterOptions = { }; /** - * Per-environment concurrent-connection limiter for realtime long-polls. 
- * - * This is a standalone copy of the limiter embedded in `realtimeClient.server.ts` - * (Electric path), so the notifier-backed client can enforce the same per-env cap - * WITHOUT modifying the existing Electric client. The Lua + key shape are - * identical; only the key prefix differs, so the two paths track independently. + * Per-environment concurrent-connection limiter for realtime long-polls; a standalone copy of the limiter in + * `realtimeClient.server.ts` (identical Lua + key shape, different key prefix) so the native backend tracks independently. */ export class RealtimeConcurrencyLimiter { private redis: RedisClient; @@ -24,7 +20,7 @@ export class RealtimeConcurrencyLimiter { constructor(private options: RealtimeConcurrencyLimiterOptions) { this.redis = createRedisClient( - options.connectionName ?? "trigger:realtime:notifier:concurrency", + options.connectionName ?? "trigger:realtime:native:concurrency", options.redis ); this.expiryTimeInSeconds = options.expiryTimeInSeconds ?? 
60 * 5; @@ -35,7 +31,7 @@ export class RealtimeConcurrencyLimiter { const key = this.#getKey(environmentId); const now = Date.now(); - const result = await this.redis.incrementAndCheckRealtimeNotifierConcurrency( + const result = await this.redis.incrementAndCheckRealtimeNativeConcurrency( key, now.toString(), requestId, @@ -57,7 +53,7 @@ export class RealtimeConcurrencyLimiter { } #registerCommands() { - this.redis.defineCommand("incrementAndCheckRealtimeNotifierConcurrency", { + this.redis.defineCommand("incrementAndCheckRealtimeNativeConcurrency", { numberOfKeys: 1, lua: /* lua */ ` local concurrencyKey = KEYS[1] @@ -98,7 +94,7 @@ export class RealtimeConcurrencyLimiter { declare module "ioredis" { interface RedisCommander { - incrementAndCheckRealtimeNotifierConcurrency( + incrementAndCheckRealtimeNativeConcurrency( key: string, timestamp: string, requestId: string, diff --git a/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts b/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts index 220f79f9308..69ca81cf2cc 100644 --- a/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts +++ b/apps/webapp/app/services/realtime/resolveRealtimeStreamClient.server.ts @@ -1,51 +1,50 @@ import { $replica } from "~/db.server"; import { env } from "~/env.server"; +import { singleton } from "~/utils/singleton"; import { FEATURE_FLAG } from "~/v3/featureFlags"; import { makeFlag } from "~/v3/featureFlags.server"; import { logger } from "../logger.server"; import { type RealtimeEnvironment } from "../realtimeClient.server"; import { realtimeClient } from "../realtimeClientGlobal.server"; import { BoundedTtlCache } from "./boundedTtlCache"; -import { type RealtimeStreamClient } from "./notifierRealtimeClient.server"; -import { getNotifierRealtimeClient } from "./notifierRealtimeClientInstance.server"; +import { type RealtimeStreamClient } from "./nativeRealtimeClient.server"; +import { getNativeRealtimeClient } from 
"./nativeRealtimeClientInstance.server"; import { getShadowRealtimeClient } from "./shadowRealtimeClientInstance.server"; -type RealtimeBackend = "electric" | "notifier" | "shadow"; +type RealtimeBackend = "electric" | "native" | "shadow"; -/** - * Chooses which backend serves a realtime run request. - * - * Two gates, both defaulting to the Electric path: - * 1. `REALTIME_NOTIFIER_ENABLED` (env master switch). When off, this returns the - * Electric client immediately — no flag read, no notifier client construction, - * byte-identical to pre-Electric-Sunset behavior. - * 2. the `realtimeBackend` feature flag (global + per-org, org wins), resolved per - * org and cached in-process for 30s so the long-poll feed doesn't hit the DB - * on every request. - */ -const notifierEnabled = env.REALTIME_NOTIFIER_ENABLED === "1"; -const BACKEND_CACHE_TTL_MS = 30_000; -// Org count is bounded, but cap to avoid unbounded growth. -const BACKEND_CACHE_MAX_ENTRIES = 50_000; +// Two gates, both defaulting to the Electric path: the env master switch, then the +// per-org `realtimeBackend` feature flag (cached so long-polls don't hit the DB per request). 
+const nativeBackendEnabled = env.REALTIME_BACKEND_NATIVE_ENABLED === "1"; -const flag = makeFlag($replica); -const backendCache = new BoundedTtlCache( - BACKEND_CACHE_TTL_MS, - BACKEND_CACHE_MAX_ENTRIES +const flag = singleton("realtimeBackendFlag", () => makeFlag($replica)); +const backendCache = singleton( + "realtimeBackendCache", + () => + new BoundedTtlCache( + env.REALTIME_BACKEND_FLAG_CACHE_TTL_MS, + env.REALTIME_BACKEND_FLAG_CACHE_MAX_ENTRIES + ) ); export async function resolveRealtimeStreamClient( - environment: RealtimeEnvironment + environment: RealtimeEnvironment & { organization?: { featureFlags?: unknown } } ): Promise { - if (!notifierEnabled) { + if (!nativeBackendEnabled) { return realtimeClient; } - switch (await getRealtimeBackend(environment.organizationId)) { - case "notifier": - return getNotifierRealtimeClient(); + // The authenticated environment already carries the org's feature flags; pass them + // through so a cache miss doesn't need an extra organization read. + const orgFeatureFlags = environment.organization + ? environment.organization.featureFlags ?? {} + : undefined; + + switch (await getRealtimeBackend(environment.organizationId, orgFeatureFlags)) { + case "native": + return getNativeRealtimeClient(); case "shadow": - // Client is still served Electric; the notifier path is diffed in the background. + // The client is still served Electric; the native path is diffed in the background. return getShadowRealtimeClient(); case "electric": default: @@ -53,7 +52,10 @@ export async function resolveRealtimeStreamClient( } } -async function getRealtimeBackend(organizationId: string): Promise { +async function getRealtimeBackend( + organizationId: string, + orgFeatureFlags: unknown | undefined +): Promise { const cached = backendCache.get(organizationId); if (cached !== undefined) { return cached; @@ -62,18 +64,23 @@ async function getRealtimeBackend(organizationId: string): Promise) ?? {}, + overrides: (overrides as Record) ?? 
{}, }); } catch (error) { - // Never let a flag lookup failure break the realtime feed — fall back to Electric. + // Never let a flag lookup failure break the realtime feed. logger.error("[resolveRealtimeStreamClient] failed to resolve realtimeBackend flag", { organizationId, error, diff --git a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts index f975af05723..58b1e5bb931 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts @@ -4,15 +4,9 @@ import { logger } from "../logger.server"; export const CHANGE_RECORD_VERSION = 1; /** - * A run-change fact, published once to the run's environment channel. Self-describing: - * - `envId` routes it to its channel (mandatory). - * - `tags` / `batchId` let a tag/batch feed decide membership LOCALLY, without a - * ClickHouse re-resolve. `tags` present (even `[]`) marks a "full" record; `tags` - * absent marks a "partial" record (envId+runId only) that a tag feed must hydrate to - * classify. `batchId` present only when the run is in a batch. - * - `runId` lets a single-run feed match; `createdAtMs` lets a tag feed apply its - * createdAt floor locally; `updatedAtMs`/`status` are hints. - * Row state (payload/output/...) is never on the wire — it's refetched from Postgres. + * A self-describing run-change fact published once to the run's environment channel; row state is + * never on the wire. `tags` present (even `[]`) marks a "full" record a feed can classify locally; + * `tags` absent marks a "partial" record (envId+runId only) a tag feed must hydrate to classify. */ export type ChangeRecord = { v: number; @@ -32,9 +26,7 @@ export function encodeChangeRecord(record: ChangeRecord): string { return JSON.stringify(record); } -/** Decode a wire message into a ChangeRecord. 
Tolerant of a bare runId (no membership - * data) so a malformed/legacy frame degrades to a partial record (hydrate-to-classify) - * rather than throwing. */ +/** Decode a wire message into a ChangeRecord; a bare/malformed frame degrades to a partial record rather than throwing. */ export function decodeChangeRecord(message: string): ChangeRecord { if (message.length === 0 || message[0] !== "{") { return { v: 0, runId: message, envId: "" }; @@ -64,19 +56,9 @@ export type RunChangeNotifierOptions = { /** Channel name prefix; the envId is appended inside a hash-tag for slot locality. */ channelPrefix?: string; connectionName?: string; - /** - * Leading-edge throttle (ms) for the per-env channel: deliver the first wake - * immediately, then at most one more per window while changes keep arriving. Bounds the - * wake rate per env regardless of run throughput. Defaults to 100ms. 0 disables it. - */ + /** Leading-edge throttle (ms) for the per-env channel, bounding the wake rate per env. Defaults to 100ms; 0 disables. */ envWakeCoalesceWindowMs?: number; - /** - * Use Redis sharded pub/sub (SSUBSCRIBE/SPUBLISH) instead of classic pub/sub. Only - * valid against a Redis Cluster (channels are hash-tagged by envId, so each lands on one - * shard) and requires the client built with `clusterOptions.shardedSubscribers: true`. - * Classic PUBLISH in a cluster broadcasts to every node, so sharded pub/sub is what - * actually distributes the load. Defaults to false (classic, for single-node / local). - */ + /** Use Redis sharded pub/sub (SSUBSCRIBE/SPUBLISH); cluster-only and requires `clusterOptions.shardedSubscribers`. Defaults to false (classic). */ shardedPubSub?: boolean; }; @@ -84,38 +66,16 @@ const DEFAULT_CHANNEL_PREFIX = "realtime:"; const DEFAULT_ENV_WAKE_COALESCE_WINDOW_MS = 100; /** - * RunChangeNotifier — carries "run X changed" facts from write sites to the realtime - * feed over ONE per-environment channel. 
- * - * Design constraints baked in here: - * - ONE channel type, `env:{}`. A change is one fact published once; who - * cares about it is a predicate evaluated by the consumer (the EnvChangeRouter), not a - * second channel. Single-run, tag, and batch feeds all read this one stream. - * - Minimal wire data (a self-describing `ChangeRecord` of small keys), never row - * columns. Row state is always refetched from Postgres. - * - ONE shared, multiplexed subscriber connection per process with a refcounted - * `Map>`. The RunQueue pattern, deliberately NOT the - * per-subscribe-connection pattern of ZodPubSub/tracePubSub (which would exhaust - * ElastiCache `maxclients`). - * - Connections are created lazily: a process that never publishes or subscribes (the - * default, flag-off state) opens no Redis connections at all. - * - `publish` is fire-and-forget and never throws; a dropped publish only costs latency - * because the consumer has a timeout backstop. - * - * Channels are hash-tagged (`env:{}`) so an env's traffic lands on one - * cluster slot. With `shardedPubSub` (cluster only) the feed uses SSUBSCRIBE/SPUBLISH so - * each env's traffic stays on one shard rather than broadcasting cluster-wide. + * RunChangeNotifier — carries "run X changed" facts from write sites to the realtime feeds over ONE + * per-environment channel (`env:{}`, hash-tagged so an env stays on one cluster slot). + * Uses one shared multiplexed subscriber per process (refcounted), created lazily, and a fire-and-forget + * `publish` that never throws — a dropped publish only costs latency because the consumer has a backstop. */ export class RunChangeNotifier { #publisher: RedisClient | undefined; #subscriber: RedisClient | undefined; readonly #listeners = new Map void>>(); - /** - * Per-channel accumulator of records since the last delivery, deduped by runId. 
A - * coalesced env window collapses many publishes into one wake; this holds the batch so - * the wake carries every run that moved, not just the last one (latest record per run - * wins, keeping the freshest keys). - */ + /** Per-channel accumulator of records since the last delivery, deduped by runId (latest per run wins), so a coalesced wake carries every run that moved. */ readonly #pending = new Map>(); readonly #channelPrefix: string; readonly #connectionName: string; @@ -134,10 +94,7 @@ export class RunChangeNotifier { this.#sharded = options.shardedPubSub ?? false; } - /** - * Fire-and-forget publish of a run-changed fact to the run's environment channel. Never - * throws. The notifier stamps the record version. - */ + /** Fire-and-forget publish of a run-changed fact to the run's environment channel; never throws. */ publish(input: ChangeRecordInput): void { const record: ChangeRecord = { v: CHANGE_RECORD_VERSION, ...input }; this.#publishToChannel(this.#channelForEnv(record.envId), encodeChangeRecord(record)); @@ -174,12 +131,7 @@ export class RunChangeNotifier { } } - /** - * Subscribe (persistently) to an environment's run-change stream. `onBatch` is invoked - * with the coalesced batch of records on every wake until the returned unsubscribe is - * called. Refcounted over the shared subscriber: the first listener for an env issues - * SUBSCRIBE, the last one UNSUBSCRIBE. - */ + /** Subscribe to an env's run-change stream; refcounted over the shared subscriber (first listener SUBSCRIBEs, last UNSUBSCRIBEs). */ subscribeToEnv(environmentId: string, onBatch: (records: ChangeRecord[]) => void): () => void { const channel = this.#channelForEnv(environmentId); const subscriber = this.#ensureSubscriber(); @@ -210,10 +162,7 @@ export class RunChangeNotifier { } current.delete(onBatch); if (current.size === 0) { - // Drop the channel from the map only AFTER Redis confirms UNSUBSCRIBE, and only if - // no new listener re-subscribed while it was in flight. 
The map entry's existence - // mirrors "subscribed (or subscribe in flight) in Redis", so the subscribe path - // safely reuses it without a duplicate SUBSCRIBE. + // Drop the channel from the map only after Redis confirms UNSUBSCRIBE and no new listener re-subscribed in the meantime. this.#unsubscribeChannel(subscriber, channel) .then(() => { const latest = this.#listeners.get(channel); @@ -223,9 +172,7 @@ export class RunChangeNotifier { if (latest.size === 0) { this.#listeners.delete(channel); } else { - // A listener arrived during the in-flight UNSUBSCRIBE; the channel is now - // unsubscribed in Redis but has live listeners. Re-subscribe so they keep - // receiving messages (the long-poll backstop covers the gap). + // A listener arrived during the in-flight UNSUBSCRIBE; re-subscribe so it keeps receiving (the backstop covers the gap). this.#subscribeChannel(subscriber, channel).catch((error) => { logger.error("[runChangeNotifier] Failed to re-subscribe to run-change channel", { error, @@ -235,9 +182,7 @@ export class RunChangeNotifier { } }) .catch((error) => { - // UNSUBSCRIBE failed: the channel is likely still subscribed in Redis. Keep the - // (empty) map entry so a future subscriber reuses it without a duplicate - // SUBSCRIBE and #onMessage stays consistent with Redis state. + // UNSUBSCRIBE failed (likely still subscribed in Redis): keep the empty map entry so a future subscriber reuses it. logger.error("[runChangeNotifier] Failed to unsubscribe from run-change channel", { error, channel, @@ -334,12 +279,7 @@ export class RunChangeNotifier { } } - /** - * Leading-edge throttle: deliver the first wake immediately, then suppress further wakes - * for the window, delivering one trailing wake if any messages arrived during it (and - * re-opening while activity continues). Caps the wake rate per env to ~1/window no - * matter how fast runs change. Lossless: the batch accumulates across the window. 
- */ + /** Leading-edge throttle capping the wake rate to ~1/window: deliver the first wake immediately, then one trailing wake per window while activity continues. Lossless. */ #deliverCoalesced(channel: string) { if (this.#coalesceTimers.has(channel)) { this.#coalesceDirty.add(channel); diff --git a/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts index fa5f5681f90..b7d90122db0 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifierHandlers.server.ts @@ -4,32 +4,18 @@ import { logger } from "../logger.server"; import { publishChangeRecord } from "./runChangeNotifierInstance.server"; /** - * ChangeRecordBuilder — builds and publishes a self-describing `ChangeRecord` to the run's - * environment channel for the lifecycle events whose engine-bus payload already carries - * env + tags + batchId. One publish per change; `envId` is always present. - * - * The terminal transitions (runSucceeded/runFailed/runExpired/runCancelled), - * runAttemptFailed, and runMetadataUpdated publish from `runEngineHandlers.server.ts` - * instead — those events don't carry env/tags/batchId on the bus, but that file already - * re-reads the run (or resolves the env) for each, so the publish piggybacks on the - * existing read rather than widening the event bus. So fully disabling publishing is the - * env master switch (`REALTIME_NOTIFIER_ENABLED`), not just deleting this file. - * - * Coverage is intentionally not exhaustive: a dropped or uncovered transition only adds - * latency because the consumer has a periodic backstop full-resolve. + * Builds and publishes a self-describing `ChangeRecord` for the lifecycle events whose engine-bus payload + * already carries env + tags + batchId. Terminal transitions, runAttemptFailed, and runMetadataUpdated publish + * from `runEngineHandlers.server.ts` instead. 
Coverage isn't exhaustive — a dropped transition only adds latency + * because the consumer has a periodic backstop full-resolve. The env master switch is `REALTIME_BACKEND_NATIVE_ENABLED`. */ export function registerRunChangeNotifierHandlers() { - // Return a truthy value in every path so the singleton() wrapper (which uses ??=) caches - // the result and never re-runs this factory — re-running would attach duplicate - // engine-bus listeners on each Remix dev-mode reload. - if (env.REALTIME_NOTIFIER_ENABLED !== "1") { + // Return truthy in every path so singleton() caches this factory and never re-runs it (re-running would attach duplicate engine-bus listeners on dev reload). + if (env.REALTIME_BACKEND_NATIVE_ENABLED !== "1") { return true; } - // Run created (trigger). The first signal a tag/batch feed gets for a brand-new run: a - // freshly-created run is born QUEUED with no status transition, so without this it only - // surfaces on the consumer's periodic backstop resolve (and not at all before ClickHouse - // ingests it). Routing the create record hydrates the new run by id straight from Postgres. + // Run created: the first signal for a brand-new run (born QUEUED with no status transition), so it surfaces before ClickHouse ingests it. 
engine.eventBus.on("runCreated", ({ run, environment }) => { publishChangeRecord({ runId: run.id, diff --git a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts index ed1d1ce12b2..c24a822ba4e 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts @@ -5,33 +5,29 @@ import { singleton } from "~/utils/singleton"; import { RunChangeNotifier, type ChangeRecordInput } from "./runChangeNotifier.server"; /** - * Process-singleton wiring for the RunChangeNotifier plus the thin, gated - * convenience functions that write sites and the realtime route delegate to. - * - * The notifier is constructed lazily (only on the first publish/subscribe when - * enabled), so a webapp running with `REALTIME_NOTIFIER_ENABLED=0` (the default) - * opens no Redis connections and registers no metrics for this subsystem. + * Process-singleton wiring for the RunChangeNotifier plus the gated convenience functions write sites + * delegate to. The notifier is constructed lazily, so `REALTIME_BACKEND_NATIVE_ENABLED=0` (default) opens no Redis connections. */ -const notifierEnabled = env.REALTIME_NOTIFIER_ENABLED === "1"; +const nativeBackendEnabled = env.REALTIME_BACKEND_NATIVE_ENABLED === "1"; function initializeRunChangeNotifier(): RunChangeNotifier { - const clusterMode = env.REALTIME_RUNS_PUBSUB_REDIS_CLUSTER_MODE_ENABLED === "1"; + const clusterMode = env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_CLUSTER_MODE_ENABLED === "1"; // Sharded pub/sub only works against a cluster; classic pub/sub there would // broadcast every message to every node, so this is what actually shards load. 
- const shardedPubSub = clusterMode && env.REALTIME_RUNS_PUBSUB_REDIS_SHARDED_ENABLED === "1"; + const shardedPubSub = clusterMode && env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_SHARDED_ENABLED === "1"; const notifier = new RunChangeNotifier({ redis: { - host: env.REALTIME_RUNS_PUBSUB_REDIS_HOST, - port: env.REALTIME_RUNS_PUBSUB_REDIS_PORT, - username: env.REALTIME_RUNS_PUBSUB_REDIS_USERNAME, - password: env.REALTIME_RUNS_PUBSUB_REDIS_PASSWORD, - tlsDisabled: env.REALTIME_RUNS_PUBSUB_REDIS_TLS_DISABLED === "true", + host: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_HOST, + port: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_PORT, + username: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_USERNAME, + password: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_PASSWORD, + tlsDisabled: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_TLS_DISABLED === "true", clusterMode, // One subscriber connection per shard so SSUBSCRIBE routes to the slot owner. ...(shardedPubSub ? { clusterOptions: { shardedSubscribers: true } } : {}), }, - envWakeCoalesceWindowMs: env.REALTIME_NOTIFIER_ENV_WAKE_COALESCE_WINDOW_MS, + envWakeCoalesceWindowMs: env.REALTIME_BACKEND_NATIVE_ENV_WAKE_COALESCE_WINDOW_MS, shardedPubSub, }); @@ -54,20 +50,20 @@ export function getRunChangeNotifier(): RunChangeNotifier { /** Whether the notifier subsystem is enabled for this process. */ export function isRunChangeNotifierEnabled(): boolean { - return notifierEnabled; + return nativeBackendEnabled; } /** Fire-and-forget publish of a run-changed record. No-op (and no notifier construction) * when disabled, so publish sites can call it unconditionally. 
*/ export function publishChangeRecord(input: ChangeRecordInput): void { - if (!notifierEnabled) { + if (!nativeBackendEnabled) { return; } getRunChangeNotifier().publish(input); } export function publishManyChangeRecords(inputs: ChangeRecordInput[]): void { - if (!notifierEnabled) { + if (!nativeBackendEnabled) { return; } getRunChangeNotifier().publishMany(inputs); diff --git a/apps/webapp/app/services/realtime/runReader.server.ts b/apps/webapp/app/services/realtime/runReader.server.ts index 4135e94366b..e8509d73de4 100644 --- a/apps/webapp/app/services/realtime/runReader.server.ts +++ b/apps/webapp/app/services/realtime/runReader.server.ts @@ -3,16 +3,9 @@ import { BoundedTtlCache } from "./boundedTtlCache"; import { RESERVED_COLUMNS, type RealtimeRunRow } from "./electricStreamProtocol.server"; /** - * RunReader — the pluggable read half of the notifier-backed realtime feed. - * - * The mandate: ClickHouse is filter-only and resolves IDs, - * Postgres always hydrates row columns. This file owns the Postgres hydration - * half (`RunHydrator`, by-id) and the `RunListResolver` interface (the tag/list - * filter -> id-set seam, implemented over ClickHouse). - * - * Splitting hydration behind this small surface keeps the realtime feed - * decoupled from where runs physically live, ready for a future `TaskRunFast` - * table or a non-Postgres row store. + * RunReader — the pluggable read half of the native-backend realtime feed: ClickHouse is filter-only + * (resolves ids), Postgres always hydrates row columns. Owns the `RunHydrator` (by-id) and the + * `RunListResolver` interface (the tag/list filter -> id-set seam, implemented over ClickHouse). */ /** The TaskRun columns the realtime feed projects (mirrors DEFAULT_ELECTRIC_COLUMNS). 
*/ @@ -45,13 +38,7 @@ export const RUN_HYDRATOR_SELECT = { realtimeStreams: true, } satisfies Prisma.TaskRunSelect; -/** - * Columns the feed needs internally regardless of the client's `skipColumns`: - * `id` keys the row, `updatedAt` drives the offset and the live working-set diff. - * Everything else can be projected away when the client skips it (see - * `buildHydratorSelect`), so the replica doesn't ship large `payload`/`output`/ - * `metadata`/`error` columns the response will drop anyway. - */ +/** Columns hydrated regardless of `skipColumns`: `id` keys the row, `updatedAt` drives the offset and working-set diff. */ const ALWAYS_HYDRATED_COLUMNS = new Set(["id", "updatedAt", ...RESERVED_COLUMNS]); /** Project `RUN_HYDRATOR_SELECT` down to the columns the client didn't skip (plus @@ -84,12 +71,7 @@ export type RunListFilter = { limit: number; }; -/** - * Resolves a tag/list filter into the matching run id-set, filter-only (no row - * columns; rows are hydrated from Postgres by id afterward). Pluggable so the - * resolution source can change without touching the feed. The ClickHouse - * implementation lives in `clickHouseRunListResolver.server.ts`. - */ +/** Resolves a tag/list filter into the matching run id-set, filter-only (rows hydrated from Postgres by id afterward). ClickHouse impl in `clickHouseRunListResolver.server.ts`. */ export interface RunListResolver { resolveMatchingRunIds(filter: RunListFilter): Promise; } @@ -97,11 +79,7 @@ export interface RunListResolver { export type RunHydratorOptions = { /** A read-replica Prisma client (`$replica`). Always Postgres. */ replica: Pick; - /** - * Read-through cache TTL (ms) to collapse duplicate refetches across a burst - * of live polls for the same run. Fan-in is low in practice, so this is - * insurance, not load-bearing. Set to 0 to disable. Defaults to 250ms. - */ + /** Read-through cache TTL (ms) collapsing duplicate refetches for the same run. Set 0 to disable. Defaults to 250ms. 
*/ cacheTtlMs?: number; /** Hard cap on cache entries before expired entries are swept. */ maxCacheEntries?: number; @@ -110,11 +88,7 @@ export type RunHydratorOptions = { const DEFAULT_CACHE_TTL_MS = 250; const DEFAULT_MAX_CACHE_ENTRIES = 5_000; -/** - * Hydrates a single run by id from the read replica, projected to the realtime - * columns. Concurrent refetches for the same (env, run) are single-flighted, and - * a short TTL cache collapses rapid repeats. - */ +/** Hydrates runs by id from the read replica, projected to the realtime columns; concurrent same-run refetches are single-flighted + short-TTL cached. */ export class RunHydrator { readonly #inflight = new Map>(); readonly #cache: BoundedTtlCache; @@ -156,9 +130,7 @@ export class RunHydrator { return row; } - /** Hydrate many runs by id in one query (tag/list feed). Order is not guaranteed. - * `skipColumns` projects the SELECT so the replica doesn't ship columns the client - * dropped (notably the large `payload`/`output`/`metadata`/`error` columns). */ + /** Hydrate many runs by id in one query (order not guaranteed); `skipColumns` projects the SELECT so dropped columns aren't shipped. */ async hydrateByIds( environmentId: string, ids: string[], diff --git a/apps/webapp/app/services/realtime/shadowCompare.server.ts b/apps/webapp/app/services/realtime/shadowCompare.server.ts index b24540bfca3..27831dd68a2 100644 --- a/apps/webapp/app/services/realtime/shadowCompare.server.ts +++ b/apps/webapp/app/services/realtime/shadowCompare.server.ts @@ -6,24 +6,10 @@ import { import { type RunHydrator, type RunListFilter, type RunListResolver } from "./runReader.server"; /** - * Dual-run shadow-compare. - * - * The client is always served the Electric response; in the background this - * re-derives what the notifier path WOULD emit and diffs the two, so we can prove - * parity on real production traffic before any cutover. 
- * - * Two kinds of divergence are checked: - * - serialization: for each run Electric emitted, re-hydrate it and serialize via - * the notifier serializer, then compare SEMANTICALLY (decode both sides per - * column type) so equivalent-but-differently-encoded wire values (timestamp - * format, bool t/true, number formatting) are not false positives. The compare - * is gated on same-version (matching updatedAt) so a row that changed between - * Electric's emit and our refetch is recorded as "skew", not a divergence. - * - membership (tag/batch initial snapshot only): the set of run ids Electric - * emitted vs the set the notifier resolver returns. This is where the known - * tag OR-vs-AND difference shows up. - * - * Pure except for the injected RunHydrator/RunListResolver, so it's unit-testable. + * Dual-run shadow-compare: the client is always served the Electric response while this re-derives what + * the native backend would emit and diffs the two, to prove parity on real traffic before cutover. Checks + * serialization (semantic per-column compare, gated on same updatedAt so a changed row is "skew", not a + * divergence) and membership (emitted id-set, only on tag/batch initial snapshots). Pure but for the injected deps. */ export type ShadowFeed = "run" | "runs" | "batch"; @@ -42,7 +28,7 @@ export type ColumnDiff = { runId: string; column: string; electric: string | null; - notifier: string | null; + native: string | null; }; export type ShadowCompareOutcome = { @@ -57,8 +43,8 @@ export type ShadowCompareOutcome = { diffs: ColumnDiff[]; /** Set membership (tag/batch initial snapshot only). undefined when not checked. 
*/ membershipMatch?: boolean; - missingInNotifier?: string[]; - extraInNotifier?: string[]; + missingInNative?: string[]; + extraInNative?: string[]; }; export type ShadowCompareInput = { @@ -116,11 +102,11 @@ export class RealtimeShadowComparator { continue; } - const notifierValue = serializeRunRow(row, input.skipColumns); + const nativeValue = serializeRunRow(row, input.skipColumns); // Only compare rows at the same version; otherwise the row advanced between // Electric's emit and our refetch (timing skew, not a divergence). - if (!sameInstant(message.value.updatedAt, notifierValue.updatedAt)) { + if (!sameInstant(message.value.updatedAt, nativeValue.updatedAt)) { outcome.serializationSkew++; continue; } @@ -131,11 +117,11 @@ export class RealtimeShadowComparator { if (!meta) { continue; } - const notifierRaw = notifierValue[column] ?? null; - if (!valuesEqual(electricRaw, notifierRaw, meta.type, meta.dims, column)) { + const nativeRaw = nativeValue[column] ?? null; + if (!valuesEqual(electricRaw, nativeRaw, meta.type, meta.dims, column)) { rowDiverged = true; if (outcome.diffs.length < MAX_DIFFS) { - outcome.diffs.push({ runId, column, electric: electricRaw, notifier: notifierRaw }); + outcome.diffs.push({ runId, column, electric: electricRaw, native: nativeRaw }); } } } @@ -151,14 +137,14 @@ export class RealtimeShadowComparator { const electricIds = new Set( changes.map((m) => m.value.id).filter((id): id is string => typeof id === "string") ); - const notifierIds = new Set( + const nativeIds = new Set( await this.options.runListResolver.resolveMatchingRunIds(input.membershipFilter) ); - outcome.missingInNotifier = [...electricIds].filter((id) => !notifierIds.has(id)); - outcome.extraInNotifier = [...notifierIds].filter((id) => !electricIds.has(id)); + outcome.missingInNative = [...electricIds].filter((id) => !nativeIds.has(id)); + outcome.extraInNative = [...nativeIds].filter((id) => !electricIds.has(id)); outcome.membershipMatch = - 
outcome.missingInNotifier.length === 0 && outcome.extraInNotifier.length === 0; + outcome.missingInNative.length === 0 && outcome.extraInNative.length === 0; } return outcome; @@ -194,36 +180,36 @@ function sameInstant(a: string | null | undefined, b: string | null | undefined) function valuesEqual( electricRaw: string | null, - notifierRaw: string | null, + nativeRaw: string | null, type: ElectricColumnType, dims: number | undefined, column: string ): boolean { - if (electricRaw == null || notifierRaw == null) { - return electricRaw == null && notifierRaw == null; + if (electricRaw == null || nativeRaw == null) { + return electricRaw == null && nativeRaw == null; } if (dims && dims > 0) { - return arraysEqual(parsePgTextArray(electricRaw), parsePgTextArray(notifierRaw)); + return arraysEqual(parsePgTextArray(electricRaw), parsePgTextArray(nativeRaw)); } switch (type) { case "timestamp": - return new Date(`${electricRaw}Z`).getTime() === new Date(`${notifierRaw}Z`).getTime(); + return new Date(`${electricRaw}Z`).getTime() === new Date(`${nativeRaw}Z`).getTime(); case "bool": - return parseBool(electricRaw) === parseBool(notifierRaw); + return parseBool(electricRaw) === parseBool(nativeRaw); case "int4": case "int8": case "float8": - return Number(electricRaw) === Number(notifierRaw); + return Number(electricRaw) === Number(nativeRaw); case "jsonb": - return jsonEqual(electricRaw, notifierRaw); + return jsonEqual(electricRaw, nativeRaw); case "text": default: if (column === "status") { - return normalizeStatus(electricRaw) === normalizeStatus(notifierRaw); + return normalizeStatus(electricRaw) === normalizeStatus(nativeRaw); } - return electricRaw === notifierRaw; + return electricRaw === nativeRaw; } } diff --git a/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts b/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts index b66b70e7ad5..90bc1d90070 100644 --- a/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts +++ 
b/apps/webapp/app/services/realtime/shadowRealtimeClient.server.ts @@ -9,7 +9,7 @@ import { RESERVED_COLUMNS } from "./electricStreamProtocol.server"; import { type RealtimeListEnvironment, type RealtimeStreamClient, -} from "./notifierRealtimeClient.server"; +} from "./nativeRealtimeClient.server"; import { type RunListFilter } from "./runReader.server"; import { type RealtimeShadowComparator, @@ -29,13 +29,7 @@ export type ShadowRealtimeClientOptions = { onOutcome?: (outcome: ShadowCompareOutcome) => void; }; -/** - * Dual-run gate: a transparent wrapper that serves the Electric - * response unchanged and, in the background, diffs what the notifier path would emit - * against it. The shadow work is fire-and-forget — it never blocks or fails the - * client's request — and it exercises the read replica so the notifier's real load - * can be measured before cutover. - */ +/** Transparent wrapper that serves the Electric response unchanged and, in the background (fire-and-forget), diffs what the native backend would emit. */ export class ShadowRealtimeClient implements RealtimeStreamClient { constructor(private readonly options: ShadowRealtimeClientOptions) {} @@ -178,8 +172,8 @@ export class ShadowRealtimeClient implements RealtimeStreamClient { serializationMatched: outcome.serializationMatched, serializationSkew: outcome.serializationSkew, membershipMatch: outcome.membershipMatch, - missingInNotifier: outcome.missingInNotifier?.slice(0, 20), - extraInNotifier: outcome.extraInNotifier?.slice(0, 20), + missingInNative: outcome.missingInNative?.slice(0, 20), + extraInNative: outcome.extraInNative?.slice(0, 20), // Log only which run/column diverged, never the raw cell values — they can // include run payload/output/metadata and must not leak into logs. 
diffs: outcome.diffs.map(({ runId, column }) => ({ runId, column })), diff --git a/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts index 95edc82620d..041e8edd5d4 100644 --- a/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts +++ b/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts @@ -17,7 +17,7 @@ import { ShadowRealtimeClient } from "./shadowRealtimeClient.server"; function initializeShadowRealtimeClient(): ShadowRealtimeClient { const compares = new Counter({ name: "realtime_shadow_compare_total", - help: "Dual-run shadow-compare outcomes (Electric vs notifier). kind=serialization|membership, result=match|diverge|skew.", + help: "Dual-run shadow-compare outcomes (Electric vs native). kind=serialization|membership, result=match|diverge|skew.", labelNames: ["feed", "kind", "result"] as const, registers: [metricsRegister], }); @@ -35,7 +35,7 @@ function initializeShadowRealtimeClient(): ShadowRealtimeClient { electric: realtimeClient, comparator, maximumCreatedAtFilterAgeMs: env.REALTIME_MAXIMUM_CREATED_AT_FILTER_AGE_IN_MS, - maxListResults: env.REALTIME_NOTIFIER_MAX_LIST_RESULTS, + maxListResults: env.REALTIME_BACKEND_NATIVE_MAX_LIST_RESULTS, onOutcome: (outcome) => { const { feed } = outcome; if (outcome.serializationMatched) { diff --git a/apps/webapp/app/v3/featureFlags.ts b/apps/webapp/app/v3/featureFlags.ts index 55b30a8396e..3066f2dda01 100644 --- a/apps/webapp/app/v3/featureFlags.ts +++ b/apps/webapp/app/v3/featureFlags.ts @@ -25,8 +25,8 @@ export const FeatureFlagCatalog = { [FEATURE_FLAG.workerQueueScheduledSplitEnabled]: z.coerce.boolean(), // Which backend serves the realtime run feed. Controllable // globally and per-org (org wins). Defaults to "electric" when unset. - // "shadow" serves Electric but diffs the notifier path in the background. 
- [FEATURE_FLAG.realtimeBackend]: z.enum(["electric", "notifier", "shadow"]), + // "shadow" serves Electric but diffs the native path in the background. + [FEATURE_FLAG.realtimeBackend]: z.enum(["electric", "native", "shadow"]), }; export type FeatureFlagKey = keyof typeof FeatureFlagCatalog; diff --git a/apps/webapp/app/v3/runEngineHandlers.server.ts b/apps/webapp/app/v3/runEngineHandlers.server.ts index 7ef4efdef82..c8d9240154e 100644 --- a/apps/webapp/app/v3/runEngineHandlers.server.ts +++ b/apps/webapp/app/v3/runEngineHandlers.server.ts @@ -48,7 +48,7 @@ export function registerRunEngineEventBusHandlers() { taskEventStore: true, // Piggyback the realtime run-changed publish on this existing read so the // per-env channel carries the membership keys (no separate query). No-op when - // the notifier is disabled. + // the native backend is disabled. runTags: true, batchId: true, }, @@ -129,7 +129,7 @@ export function registerRunEngineEventBusHandlers() { organizationId: true, taskEventStore: true, // Piggyback the realtime run-changed publish on this existing read (no-op when - // the notifier is disabled). + // the native backend is disabled). runTags: true, batchId: true, }, @@ -197,7 +197,7 @@ export function registerRunEngineEventBusHandlers() { organizationId: true, taskEventStore: true, // Piggyback the realtime run-changed publish on this existing read (no-op when - // the notifier is disabled). + // the native backend is disabled). runTags: true, batchId: true, }, @@ -389,7 +389,7 @@ export function registerRunEngineEventBusHandlers() { organizationId: true, taskEventStore: true, // Piggyback the realtime run-changed publish on this existing read (no-op when - // the notifier is disabled). + // the native backend is disabled). 
runTags: true, batchId: true, }, @@ -454,7 +454,7 @@ export function registerRunEngineEventBusHandlers() { organizationId: true, taskEventStore: true, // Piggyback the realtime run-changed publish on this existing read (no-op when - // the notifier is disabled). + // the native backend is disabled). runTags: true, batchId: true, }, diff --git a/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts b/apps/webapp/test/realtime/nativeHoldOnEmpty.test.ts similarity index 96% rename from apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts rename to apps/webapp/test/realtime/nativeHoldOnEmpty.test.ts index e0c51d57f52..4a356dd7e5f 100644 --- a/apps/webapp/test/realtime/notifierHoldOnEmpty.test.ts +++ b/apps/webapp/test/realtime/nativeHoldOnEmpty.test.ts @@ -1,9 +1,9 @@ import { setTimeout as sleep } from "node:timers/promises"; import { CURRENT_API_VERSION } from "~/api/versions"; import { - NotifierRealtimeClient, + NativeRealtimeClient, type RealtimeListEnvironment, -} from "~/services/realtime/notifierRealtimeClient.server"; +} from "~/services/realtime/nativeRealtimeClient.server"; import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; import { EnvChangeRouter, @@ -67,7 +67,7 @@ function makeClient(overrides: Record = {}) { const src = fakeSource(); const router = new EnvChangeRouter({ source: src.source, hydrator: { hydrateByIds: hydrateSpy } }); - const client = new NotifierRealtimeClient({ + const client = new NativeRealtimeClient({ runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, runListResolver: { resolveMatchingRunIds: resolveSpy } as any, router, @@ -83,7 +83,7 @@ function makeClient(overrides: Record = {}) { return { client, src, hydrateSpy, resolveSpy, setRows: (rows: RealtimeRunRow[]) => (rowsToReturn = rows) }; } -function liveRuns(client: NotifierRealtimeClient) { +function liveRuns(client: NativeRealtimeClient) { return client.streamRuns( 
`http://localhost:3030/realtime/v1/runs?offset=${FLOOR_MS}_1&live=true&handle=runs_${FLOOR_MS}_7`, ENV, @@ -111,7 +111,7 @@ const hasRowOp = (body: Awaited>) => const isUpToDate = (body: Awaited>) => body.some((m) => m?.headers?.control === "up-to-date"); -describe("NotifierRealtimeClient multi-run live path over the router", () => { +describe("NativeRealtimeClient multi-run live path over the router", () => { it("a matching change hydrates by id (no ClickHouse) and returns a delta", async () => { const { client, src, hydrateSpy, resolveSpy, setRows } = makeClient(); setRows([row("run_1", FLOOR_MS + 5_000, { tags: ["t"] })]); diff --git a/apps/webapp/test/realtime/notifierRealtimeClient.test.ts b/apps/webapp/test/realtime/nativeRealtimeClient.test.ts similarity index 95% rename from apps/webapp/test/realtime/notifierRealtimeClient.test.ts rename to apps/webapp/test/realtime/nativeRealtimeClient.test.ts index 5f7b96fc099..3d113556679 100644 --- a/apps/webapp/test/realtime/notifierRealtimeClient.test.ts +++ b/apps/webapp/test/realtime/nativeRealtimeClient.test.ts @@ -1,8 +1,8 @@ import { CURRENT_API_VERSION } from "~/api/versions"; import { - NotifierRealtimeClient, + NativeRealtimeClient, type RealtimeListEnvironment, -} from "~/services/realtime/notifierRealtimeClient.server"; +} from "~/services/realtime/nativeRealtimeClient.server"; import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; import { EnvChangeRouter } from "~/services/realtime/envChangeRouter.server"; import { describe, expect, it } from "vitest"; @@ -41,7 +41,7 @@ function sampleRow(): RealtimeRunRow { // Only the initial-snapshot path is exercised here, which touches the shared // #buildResponse — enough to lock the response-header contract. function makeClient(row: RealtimeRunRow | null) { - return new NotifierRealtimeClient({ + return new NativeRealtimeClient({ runReader: { getRunById: async () => row, hydrateByIds: async () => (row ? 
[row] : []), @@ -64,7 +64,7 @@ const ENV: RealtimeListEnvironment = { projectId: "proj_1", }; -describe("NotifierRealtimeClient response headers", () => { +describe("NativeRealtimeClient response headers", () => { it("exposes electric headers cross-origin so browser hooks can read them", async () => { const client = makeClient(sampleRow()); const res = await client.streamRun( diff --git a/apps/webapp/test/realtime/notifierRunSetCache.test.ts b/apps/webapp/test/realtime/nativeRunSetCache.test.ts similarity index 95% rename from apps/webapp/test/realtime/notifierRunSetCache.test.ts rename to apps/webapp/test/realtime/nativeRunSetCache.test.ts index 7a6449a9eb7..34beaee2d65 100644 --- a/apps/webapp/test/realtime/notifierRunSetCache.test.ts +++ b/apps/webapp/test/realtime/nativeRunSetCache.test.ts @@ -1,8 +1,8 @@ import { CURRENT_API_VERSION } from "~/api/versions"; import { - NotifierRealtimeClient, + NativeRealtimeClient, type RealtimeListEnvironment, -} from "~/services/realtime/notifierRealtimeClient.server"; +} from "~/services/realtime/nativeRealtimeClient.server"; import { type RealtimeRunRow } from "~/services/realtime/electricStreamProtocol.server"; import { EnvChangeRouter } from "~/services/realtime/envChangeRouter.server"; import { setTimeout as sleep } from "node:timers/promises"; @@ -23,7 +23,7 @@ function makeClient(overrides: Record = {}) { const resolveSpy = vi.fn(async () => ["run_1", "run_2"]); const hydrateSpy = vi.fn(async (_env: string, ids: string[]) => ids.map(row)); - const client = new NotifierRealtimeClient({ + const client = new NativeRealtimeClient({ runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, runListResolver: { resolveMatchingRunIds: resolveSpy } as any, // No-op source: live polls never get a router wake, so they fall through to the @@ -44,7 +44,7 @@ function makeClient(overrides: Record = {}) { // streamBatch with offset=-1 takes the snapshot path, which calls the coalescing // resolve+hydrate directly 
(no concurrency slot / subscription needed). -function snapshot(client: NotifierRealtimeClient, batchId: string, skipColumns?: string) { +function snapshot(client: NativeRealtimeClient, batchId: string, skipColumns?: string) { const skip = skipColumns ? `&skipColumns=${skipColumns}` : ""; return client.streamBatch( `http://localhost:3030/realtime/v1/batches/${batchId}?offset=-1${skip}`, @@ -57,7 +57,7 @@ function snapshot(client: NotifierRealtimeClient, batchId: string, skipColumns?: } // Tag-list snapshot (offset=-1) — exercises the createdAt bucketing + cache key. -function snapshotTag(client: NotifierRealtimeClient, tags: string[]) { +function snapshotTag(client: NativeRealtimeClient, tags: string[]) { return client.streamRuns( "http://localhost:3030/realtime/v1/runs?offset=-1", ENV, @@ -68,7 +68,7 @@ function snapshotTag(client: NotifierRealtimeClient, tags: string[]) { ); } -describe("NotifierRealtimeClient run-set resolve coalescing + cache", () => { +describe("NativeRealtimeClient run-set resolve coalescing + cache", () => { it("coalesces concurrent same-filter resolves into one ClickHouse + Postgres query", async () => { const { client, resolveSpy, hydrateSpy } = makeClient(); let release!: (ids: string[]) => void; @@ -152,7 +152,7 @@ describe("NotifierRealtimeClient run-set resolve coalescing + cache", () => { }); }); -describe("NotifierRealtimeClient resolve admission gate (mass-reconnect stampede)", () => { +describe("NativeRealtimeClient resolve admission gate (mass-reconnect stampede)", () => { // A resolver that blocks each invocation until released, so we can watch how many run // concurrently. Tracks peak concurrency and exposes a release-one-at-a-time drain. 
function gatedResolver() { @@ -176,7 +176,7 @@ describe("NotifierRealtimeClient resolve admission gate (mass-reconnect stampede function makeGatedClient(resolveAdmissionLimit: number, resolver: ReturnType) { const hydrateSpy = vi.fn(async (_env: string, ids: string[]) => ids.map(row)); - return new NotifierRealtimeClient({ + return new NativeRealtimeClient({ runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, runListResolver: { resolveMatchingRunIds: resolver.resolve } as any, router: new EnvChangeRouter({ @@ -230,7 +230,7 @@ describe("NotifierRealtimeClient resolve admission gate (mass-reconnect stampede }); }); -describe("NotifierRealtimeClient tag-list createdAt bucketing", () => { +describe("NativeRealtimeClient tag-list createdAt bucketing", () => { it("floors the resolved createdAt lower bound to the bucket boundary", async () => { // Fix the clock to a non-bucket-aligned instant so the assertion is deterministic. vi.useFakeTimers({ toFake: ["Date"] }); @@ -286,7 +286,7 @@ describe("NotifierRealtimeClient tag-list createdAt bucketing", () => { }); }); -describe("NotifierRealtimeClient review fixes", () => { +describe("NativeRealtimeClient review fixes", () => { // makeClient's router has a no-op source, so the live poll never gets a wake and falls // through to its backstop timeout — the full ClickHouse resolve these tests assert on // (createdAt clamp / concurrency limit). 
diff --git a/apps/webapp/test/realtime/shadowCompare.test.ts b/apps/webapp/test/realtime/shadowCompare.test.ts index e6604a02cd6..0d5f431f0bf 100644 --- a/apps/webapp/test/realtime/shadowCompare.test.ts +++ b/apps/webapp/test/realtime/shadowCompare.test.ts @@ -80,7 +80,7 @@ describe("RealtimeShadowComparator serialization", () => { it("does not flag semantically-equivalent but differently-encoded values", async () => { const row = sampleRow(); - // Electric encodes bool as "true" (notifier uses "t"), a number with a trailing + // Electric encodes bool as "true" (native uses "t"), a number with a trailing // zero, and a timestamp without millis — all equal after decoding. const value = { ...serializeRunRow(row), @@ -120,7 +120,7 @@ describe("RealtimeShadowComparator serialization", () => { expect(out.serializationDiverged).toBe(1); expect(out.serializationMatched).toBe(0); expect(out.diffs).toEqual([ - { runId: "run_a", column: "payload", electric: '{"hello":"TAMPERED"}', notifier: '{"hello":"world"}' }, + { runId: "run_a", column: "payload", electric: '{"hello":"TAMPERED"}', native: '{"hello":"world"}' }, ]); }); @@ -178,7 +178,7 @@ describe("RealtimeShadowComparator membership", () => { return JSON.stringify([...msgs, UP_TO_DATE]); } - it("matches when Electric's set equals the notifier resolver's set", async () => { + it("matches when Electric's set equals the native resolver's set", async () => { const cmp = makeComparator( { a: sampleRow({ id: "a" }), b: sampleRow({ id: "b" }) }, ["a", "b"] @@ -192,14 +192,14 @@ describe("RealtimeShadowComparator membership", () => { membershipFilter: filter, }); expect(out.membershipMatch).toBe(true); - expect(out.missingInNotifier).toEqual([]); - expect(out.extraInNotifier).toEqual([]); + expect(out.missingInNative).toEqual([]); + expect(out.extraInNative).toEqual([]); }); - it("reports rows missing from / extra in the notifier resolution", async () => { + it("reports rows missing from / extra in the native resolution", async 
() => { const cmp = makeComparator( { a: sampleRow({ id: "a" }), b: sampleRow({ id: "b" }) }, - ["a", "c"] // notifier missing b, has extra c + ["a", "c"] // native missing b, has extra c ); const out = await cmp.compare({ feed: "runs", @@ -210,7 +210,7 @@ describe("RealtimeShadowComparator membership", () => { membershipFilter: filter, }); expect(out.membershipMatch).toBe(false); - expect(out.missingInNotifier).toEqual(["b"]); - expect(out.extraInNotifier).toEqual(["c"]); + expect(out.missingInNative).toEqual(["b"]); + expect(out.extraInNative).toEqual(["c"]); }); }); From 77ac66a8b77166c268df4ec3e45701453c79ad3a Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 10 Jun 2026 13:08:49 +0100 Subject: [PATCH 16/23] fix(webapp): deliver realtime changes that land between long-polls immediately A change published in the short window while a client was between long-polls missed its wake and waited for the backstop, so the last update of a burst could show stale for up to twenty seconds. The router now buffers each environment's recent change records, keeps the environment subscription alive briefly after the last feed disconnects, and replays exactly the records a returning poll missed, tracked per connection. A poll arriving on an instance that cannot prove coverage resolves once up front instead of holding blind, and a single-run poll woken by an already-seen record holds instead of returning an empty response the client would instantly re-issue. Verified end to end: a burst task's final update reaches the browser within milliseconds of the database write. 
--- apps/webapp/app/env.server.ts | 6 + .../realtime/envChangeRouter.server.ts | 171 +++++++++++++++++- .../realtime/nativeRealtimeClient.server.ts | 144 +++++++++++---- .../nativeRealtimeClientInstance.server.ts | 11 ++ .../test/realtime/envChangeRouter.test.ts | 119 +++++++++++- .../test/realtime/nativeHoldOnEmpty.test.ts | 65 ++++++- .../realtime/nativeRealtimeClient.test.ts | 2 + .../test/realtime/nativeRunSetCache.test.ts | 4 + 8 files changed, 472 insertions(+), 50 deletions(-) diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index c6844e50504..41431ba47f1 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -322,6 +322,12 @@ const EnvironmentSchema = z REALTIME_BACKEND_NATIVE_HOLD_ON_EMPTY: z.string().default("1"), // Max concurrent fresh ClickHouse resolves per instance (reconnect-stampede gate); 0 disables. REALTIME_BACKEND_NATIVE_RESOLVE_ADMISSION_LIMIT: z.coerce.number().int().default(16), + // Replay window (ms) for buffered change records delivered to newly-armed feeds; 0 disables. + REALTIME_BACKEND_NATIVE_REPLAY_WINDOW_MS: z.coerce.number().int().default(2_000), + // Cap on buffered recent records per env (latest record per run). + REALTIME_BACKEND_NATIVE_REPLAY_MAX_RUNS: z.coerce.number().int().default(512), + // Keep an env subscribed + buffering this long (ms) after its last feed closes; 0 disables. + REALTIME_BACKEND_NATIVE_UNSUBSCRIBE_LINGER_MS: z.coerce.number().int().default(5_000), // Fallback per-env concurrent-connection limit when the org has none configured. REALTIME_BACKEND_NATIVE_DEFAULT_CONCURRENCY_LIMIT: z.coerce.number().int().default(100_000), // TTL/size of the single-run read-through cache that collapses duplicate refetch bursts. 
diff --git a/apps/webapp/app/services/realtime/envChangeRouter.server.ts b/apps/webapp/app/services/realtime/envChangeRouter.server.ts index 25b4f912a23..7fac5cf5dd6 100644 --- a/apps/webapp/app/services/realtime/envChangeRouter.server.ts +++ b/apps/webapp/app/services/realtime/envChangeRouter.server.ts @@ -40,8 +40,20 @@ export type EnvChangeRouterOptions = { hydrator: RowHydrator; /** Observability: a hydrate-by-id batch ran (count = runs hydrated this tick). */ onHydrate?: (runCount: number) => void; + /** How far back (ms) a newly-armed feed replays buffered records. 0 disables replay. */ + replayWindowMs?: number; + /** Cap on buffered recent records per env (latest record per run). */ + replayMaxRunsPerEnv?: number; + /** How long (ms) to keep an env subscribed + buffering after its last feed closes. 0 disables. */ + unsubscribeLingerMs?: number; + /** Observability: a replay hydrated buffered candidates; "delivered" when rows reached the waiting poll, "empty" when none did. */ + onReplay?: (result: "delivered" | "empty") => void; }; +const DEFAULT_REPLAY_WINDOW_MS = 2_000; +const DEFAULT_REPLAY_MAX_RUNS_PER_ENV = 512; +const DEFAULT_UNSUBSCRIBE_LINGER_MS = 5_000; + /** Handle a feed holds for the duration of one long-poll. */ export type FeedRegistration = { /** Wait for the next batch matching this feed (or timeout/abort), with the matched runs @@ -49,6 +61,10 @@ export type FeedRegistration = { waitForMatch(signal: AbortSignal | undefined, timeoutMs: number): Promise; /** Deregister from the index; unsubscribes the env when the last feed leaves. */ close(): void; + /** False when this instance can't prove its buffer covers the caller's inter-poll gap (the env + * subscription started after the gap began, or the gap exceeds the replay window), so a change may + * have been missed (hop/cold start) — the caller should resolve once instead of holding blind. */ + gapCovered: boolean; }; type Feed = { @@ -57,6 +73,8 @@ type Feed = { columnSig: string; /** The currently-waiting poll's resolver (null between polls).
*/ resolve: ((result: WaitResult) => void) | null; + /** Buffered records at or before this timestamp have been replayed (or predate this feed). */ + replayCursorMs: number; }; type EnvState = { @@ -67,6 +85,12 @@ type EnvState = { byBatchId: Map>; /** All tag feeds, for routing partial records (no tags) as hydrate-to-classify candidates. */ tagFeeds: Set; + /** When this env's channel subscription started (for the gap-coverage check). */ + subscribedAtMs: number; + /** Latest record per run, insertion-ordered, for replaying inter-poll gaps to newly-armed feeds. */ + recent: Map; + /** Pending teardown while the env lingers with zero feeds. */ + lingerTimer?: ReturnType; }; function addToIndex(index: Map>, key: string, feed: Feed) { @@ -93,13 +117,29 @@ export class EnvChangeRouter { constructor(private readonly options: EnvChangeRouterOptions) {} - register(environmentId: string, filter: FeedFilter, skipColumns: string[]): FeedRegistration { + register( + environmentId: string, + filter: FeedFilter, + skipColumns: string[], + opts?: { + /** When the caller last received data for this connection. Bounds the replay to the + * true inter-poll gap; older than the window can't be proven covered. */ + replaySinceMs?: number; + } + ): FeedRegistration { const env = this.#ensureEnv(environmentId); + const replayWindowMs = this.options.replayWindowMs ?? DEFAULT_REPLAY_WINDOW_MS; + const now = Date.now(); + const windowFloorMs = now - replayWindowMs; + const sinceMs = opts?.replaySinceMs ?? windowFloorMs; const feed: Feed = { filter, skipColumns, columnSig: skipColumns.length > 0 ? [...skipColumns].sort().join(",") : "", resolve: null, + // First arm replays the caller's inter-poll gap; later arms only what arrived since. + // The buffer only spans the window, so never rewind past it. 
+ replayCursorMs: Math.max(sinceMs, windowFloorMs), }; env.feeds.add(feed); @@ -129,6 +169,16 @@ export class EnvChangeRouter { onAbort = () => settle({ reason: "abort", rows: [] }); signal.addEventListener("abort", onAbort, { once: true }); } + // Deliver any buffered records this feed hasn't seen (catches changes that + // landed while the caller was between polls). + if (replayWindowMs > 0 && env.recent.size > 0) { + this.#replayRecent(environmentId, env, feed).catch((error) => { + logger.error("[envChangeRouter] failed to replay buffered records", { + environmentId, + error, + }); + }); + } }); const close = () => { @@ -141,12 +191,18 @@ export class EnvChangeRouter { feed.resolve?.({ reason: "abort", rows: [] }); feed.resolve = null; if (env.feeds.size === 0) { - this.#envs.delete(environmentId); - env.unsubscribe(); + this.#scheduleEnvTeardown(environmentId, env); } }; - return { waitForMatch, close }; + return { + waitForMatch, + close, + // Covered when this instance was already subscribed (and buffering) at the gap's + // start, and the gap fits inside the buffer's window. + gapCovered: + replayWindowMs <= 0 || (env.subscribedAtMs <= sinceMs && sinceMs >= windowFloorMs), + }; } /** Distinct environments currently routed (for metrics). */ @@ -157,6 +213,11 @@ export class EnvChangeRouter { #ensureEnv(environmentId: string): EnvState { const existing = this.#envs.get(environmentId); if (existing) { + // A pending teardown is cancelled by new interest; the buffer survives the gap. 
+ if (existing.lingerTimer) { + clearTimeout(existing.lingerTimer); + existing.lingerTimer = undefined; + } return existing; } const env: EnvState = { @@ -166,9 +227,12 @@ export class EnvChangeRouter { byTag: new Map(), byBatchId: new Map(), tagFeeds: new Set(), + subscribedAtMs: Date.now(), + recent: new Map(), }; this.#envs.set(environmentId, env); env.unsubscribe = this.options.source.subscribeToEnv(environmentId, (records) => { + this.#bufferRecent(env, records); // Fire-and-forget; catch hydrate failures here (unhandled rejection exits the process) — waiters time out into the backstop. this.#onBatch(environmentId, env, records).catch((error) => { logger.error("[envChangeRouter] failed to route a change batch", { @@ -180,6 +244,105 @@ export class EnvChangeRouter { return env; } + /** Keep the env subscribed + buffering for a linger after its last feed closes, so a + * client's next poll (or another instance hop landing back here) can replay the gap. */ + #scheduleEnvTeardown(environmentId: string, env: EnvState) { + const lingerMs = this.options.unsubscribeLingerMs ?? DEFAULT_UNSUBSCRIBE_LINGER_MS; + if (lingerMs <= 0) { + this.#envs.delete(environmentId); + env.unsubscribe(); + return; + } + if (env.lingerTimer) { + clearTimeout(env.lingerTimer); + } + env.lingerTimer = setTimeout(() => { + if (env.feeds.size === 0) { + this.#envs.delete(environmentId); + env.unsubscribe(); + } + }, lingerMs); + env.lingerTimer.unref?.(); + } + + /** Upsert the latest record per run (insertion-ordered) and prune to the window + cap. */ + #bufferRecent(env: EnvState, records: ChangeRecord[]) { + const windowMs = this.options.replayWindowMs ?? DEFAULT_REPLAY_WINDOW_MS; + if (windowMs <= 0) { + return; + } + const maxRuns = this.options.replayMaxRunsPerEnv ?? 
DEFAULT_REPLAY_MAX_RUNS_PER_ENV; + const now = Date.now(); + for (const record of records) { + env.recent.delete(record.runId); + env.recent.set(record.runId, { record, receivedAtMs: now }); + } + const cutoff = now - windowMs; + for (const [runId, entry] of env.recent) { + if (entry.receivedAtMs >= cutoff && env.recent.size <= maxRuns) { + break; + } + env.recent.delete(runId); + } + } + + /** Whether a buffered record matches a feed's predicate (mirrors #onBatch's routing). */ + #recordMatchesFeed(record: ChangeRecord, feed: Feed): boolean { + switch (feed.filter.kind) { + case "run": + return record.runId === feed.filter.runId; + case "batch": + return record.batchId != null && record.batchId === feed.filter.batchId; + case "tag": { + // Partial record (no tags) = hydrate-to-classify candidate, like the live path. + if (record.tags === undefined) { + return true; + } + const tags = feed.filter.tags; + return record.tags.some((tag) => tags.includes(tag)); + } + } + } + + /** Deliver buffered records newer than the feed's cursor through the normal + * hydrate -> serialize -> settle pipeline. Already-seen rows diff to nothing downstream. 
*/ + async #replayRecent(environmentId: string, env: EnvState, feed: Feed) { + const cursor = feed.replayCursorMs; + feed.replayCursorMs = Date.now(); + + const runIds: string[] = []; + for (const [runId, entry] of env.recent) { + if (entry.receivedAtMs > cursor && this.#recordMatchesFeed(entry.record, feed)) { + runIds.push(runId); + } + } + if (runIds.length === 0 || !feed.resolve) { + return; + } + + const hydrated = await this.options.hydrator.hydrateByIds( + environmentId, + runIds, + feed.skipColumns + ); + this.options.onHydrate?.(hydrated.length); + + const rows: MatchedRow[] = []; + for (const row of hydrated) { + if (feed.filter.kind === "tag" && !this.#tagRowMatches(row, feed.filter)) { + continue; + } + rows.push({ row, value: serializeRunRow(row, feed.skipColumns) }); + } + + if (rows.length > 0 && feed.resolve) { + this.options.onReplay?.("delivered"); + feed.resolve({ reason: "notify", rows }); + } else { + this.options.onReplay?.("empty"); + } + } + #indexFeed(env: EnvState, feed: Feed) { switch (feed.filter.kind) { case "run": diff --git a/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts b/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts index fd34a1509b8..41e2944924e 100644 --- a/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts +++ b/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts @@ -70,8 +70,9 @@ export interface RealtimeStreamClient { export type WakeupReason = "notify" | "timeout" | "abort"; -/** How a live poll resolved: `fast-hydrate` (router woke us, hydrate-by-id) or `full-resolve` (backstop ClickHouse resolve). */ -export type LivePollPath = "fast-hydrate" | "full-resolve"; +/** How a live poll resolved: `fast-hydrate` (router woke us, hydrate-by-id), `full-resolve` + * (backstop), or `cold-resolve` (fresh env subscription probed once instead of holding blind). 
*/ +export type LivePollPath = "fast-hydrate" | "full-resolve" | "cold-resolve"; export type NativeRealtimeClientOptions = { runReader: RunHydrator; @@ -196,12 +197,19 @@ export class NativeRealtimeClient implements RealtimeStreamClient { readonly #runSetInflight = new Map>(); /** Bounds concurrent fresh CH resolves (undefined => unbounded). */ readonly #admissionGate?: ResolveAdmissionGate; + /** Per-connection: when this connection's last response was sent, so the router's + * replay covers exactly the inter-poll gap instead of rewinding a full window. */ + readonly #replayCursorCache: BoundedTtlCache; constructor(private readonly options: NativeRealtimeClientOptions) { this.#workingSetCache = new BoundedTtlCache( options.workingSetCacheTtlMs ?? LIST_CACHE_TTL_MS, options.listCacheMaxEntries ?? LIST_CACHE_MAX_ENTRIES ); + this.#replayCursorCache = new BoundedTtlCache( + options.workingSetCacheTtlMs ?? LIST_CACHE_TTL_MS, + options.listCacheMaxEntries ?? LIST_CACHE_MAX_ENTRIES + ); this.#runSetCache = new BoundedTtlCache( options.runSetResolveCacheTtlMs ?? DEFAULT_RUNSET_CACHE_TTL_MS, options.runSetResolveCacheMaxEntries ?? DEFAULT_RUNSET_CACHE_MAX_ENTRIES @@ -396,33 +404,80 @@ export class NativeRealtimeClient implements RealtimeStreamClient { { kind: "run", runId }, skipColumns ); + const deadline = Date.now() + this.#jitteredTimeout(); try { - const { reason, rows } = await registration.waitForMatch(signal, this.#jitteredTimeout()); - this.options.onWakeup?.(reason); - - if (reason === "abort") { - return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { - offset, - handle, - cursor: String(this.#nextSeq()), - }); + // Cold start (fresh env subscription, e.g. an instance hop): a change in the + // caller's inter-poll gap may have been missed — check the row once, then hold. 
+ if (!registration.gapCovered) { + this.options.onLivePollPath?.("cold-resolve"); + const probed = await this.options.runReader.getRunById(environment.id, runId); + if (probed && probed.updatedAt.getTime() > lastSeenMs) { + const seq = this.#nextSeq(); + return this.#buildResponse( + buildUpdateBody(probed, skipColumns), + apiVersion, + clientVersion, + { + offset: encodeOffset(probed.updatedAt.getTime(), seq), + handle, + cursor: String(seq), + } + ); + } } - if (reason === "notify" && rows.length > 0) { - // The router hydrated + serialized this run; emit it (only on advance). - this.options.onLivePollPath?.("fast-hydrate"); - const matched = rows[0]; - const updatedAtMs = matched.row.updatedAt.getTime(); + while (true) { + const remaining = deadline - Date.now(); + const { reason, rows } = + remaining > 0 + ? await registration.waitForMatch(signal, remaining) + : { reason: "timeout" as const, rows: [] as MatchedRow[] }; + this.options.onWakeup?.(reason); + + if (reason === "abort") { + return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { + offset, + handle, + cursor: String(this.#nextSeq()), + }); + } + + if (reason === "notify" && rows.length > 0) { + // The router hydrated + serialized this run; emit it (only on advance). + this.options.onLivePollPath?.("fast-hydrate"); + const matched = rows[0]; + const updatedAtMs = matched.row.updatedAt.getTime(); + if (updatedAtMs > lastSeenMs) { + const seq = this.#nextSeq(); + return this.#buildResponse( + buildRowsBodyFromSerialized([ + { runId: matched.row.id, value: matched.value, operation: "update" }, + ]), + apiVersion, + clientVersion, + { offset: encodeOffset(updatedAtMs, seq), handle, cursor: String(seq) } + ); + } + // Already seen (e.g. a replayed record): keep holding rather than returning an + // empty up-to-date the client would immediately re-issue. + continue; + } + + // Backstop timeout: re-check the run directly (no ClickHouse for the single-run feed). 
+ this.options.onLivePollPath?.("full-resolve"); + const row = await this.options.runReader.getRunById(environment.id, runId); const seq = this.#nextSeq(); - if (updatedAtMs > lastSeenMs) { + if (row && row.updatedAt.getTime() > lastSeenMs) { return this.#buildResponse( - buildRowsBodyFromSerialized([ - { runId: matched.row.id, value: matched.value, operation: "update" }, - ]), + buildUpdateBody(row, skipColumns), apiVersion, clientVersion, - { offset: encodeOffset(updatedAtMs, seq), handle, cursor: String(seq) } + { + offset: encodeOffset(row.updatedAt.getTime(), seq), + handle, + cursor: String(seq), + } ); } return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { @@ -431,23 +486,6 @@ export class NativeRealtimeClient implements RealtimeStreamClient { cursor: String(seq), }); } - - // Backstop timeout: re-check the run directly (no ClickHouse for the single-run feed). - this.options.onLivePollPath?.("full-resolve"); - const row = await this.options.runReader.getRunById(environment.id, runId); - const seq = this.#nextSeq(); - if (row && row.updatedAt.getTime() > lastSeenMs) { - return this.#buildResponse(buildUpdateBody(row, skipColumns), apiVersion, clientVersion, { - offset: encodeOffset(row.updatedAt.getTime(), seq), - handle, - cursor: String(seq), - }); - } - return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { - offset, - handle, - cursor: String(seq), - }); } finally { registration.close(); } @@ -477,6 +515,7 @@ export class NativeRealtimeClient implements RealtimeStreamClient { maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs); } this.#workingSetCache.set(this.#workingSetKey(environment.id, handle), seen); + this.#replayCursorCache.set(this.#workingSetKey(environment.id, handle), Date.now()); return this.#buildResponse(buildRowsBody(changes, skipColumns), apiVersion, clientVersion, { offset: encodeOffset(maxUpdatedAt, this.#nextSeq()), @@ -507,8 +546,10 @@ export class NativeRealtimeClient implements 
RealtimeStreamClient { const workingSetKey = this.#workingSetKey(environment.id, handle); let prevSeen = this.#workingSetCache.get(workingSetKey); + const markPollEnd = () => this.#replayCursorCache.set(workingSetKey, Date.now()); const emitFromSerialized = (changes: SerializedRowChange[], maxUpdatedAt: number): Response => { const seq = this.#nextSeq(); + markPollEnd(); return this.#buildResponse(buildRowsBodyFromSerialized(changes), apiVersion, clientVersion, { offset: encodeOffset(maxUpdatedAt, seq), handle, @@ -517,6 +558,7 @@ export class NativeRealtimeClient implements RealtimeStreamClient { }; const emitFromRows = (changes: RowChange[], maxUpdatedAt: number): Response => { const seq = this.#nextSeq(); + markPollEnd(); return this.#buildResponse(buildRowsBody(changes, skipColumns), apiVersion, clientVersion, { offset: encodeOffset(maxUpdatedAt, seq), handle, @@ -525,6 +567,7 @@ export class NativeRealtimeClient implements RealtimeStreamClient { }; const emitUpToDate = (maxUpdatedAt: number): Response => { const seq = this.#nextSeq(); + markPollEnd(); return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { offset: encodeOffset(maxUpdatedAt, seq), handle, @@ -535,11 +578,34 @@ export class NativeRealtimeClient implements RealtimeStreamClient { const registration = this.options.router.register( environment.id, this.#feedFilter(filter), - skipColumns + skipColumns, + // When this connection last received data, so replay covers exactly its gap. + { replaySinceMs: this.#replayCursorCache.get(workingSetKey) } ); + // Cold start (fresh env subscription, e.g. an instance hop): resolve once up front + // instead of holding blind — a change in the caller's inter-poll gap may have been missed. 
+ let coldProbe = !registration.gapCovered; + try { while (true) { + if (coldProbe) { + coldProbe = false; + this.options.onLivePollPath?.("cold-resolve"); + const resolved = await this.#resolveAndHydrate(environment, filter, skipColumns); + const { changes, maxUpdatedAt, touched } = this.#diffRows( + resolved, + prevSeen, + offsetFloorMs + ); + this.#workingSetCache.set(workingSetKey, touched); + prevSeen = touched; + if (changes.length > 0) { + return emitFromRows(changes, maxUpdatedAt); + } + continue; // nothing was missed — hold as usual + } + const remaining = deadline - Date.now(); const { reason, rows } = remaining > 0 diff --git a/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts index 04e3435bb37..af1c214c7ec 100644 --- a/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts +++ b/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts @@ -56,6 +56,13 @@ function initializeNativeRealtimeClient(): NativeRealtimeClient { registers: [metricsRegister], }); + const replays = new Counter({ + name: "realtime_native_replays_total", + help: "Buffered change records replayed to a newly-armed feed (inter-poll gap recovery). 
'delivered' = rows reached the feed; 'empty' = candidates hydrated but none survived the filter/diff.", + labelNames: ["result"] as const, + registers: [metricsRegister], + }); + const limiter = new RealtimeConcurrencyLimiter({ keyPrefix: "tr:realtime:native:concurrency", redis: { @@ -79,6 +86,10 @@ function initializeNativeRealtimeClient(): NativeRealtimeClient { source: getRunChangeNotifier(), hydrator: runReader, onHydrate: (runCount) => routerHydrates.inc(runCount), + replayWindowMs: env.REALTIME_BACKEND_NATIVE_REPLAY_WINDOW_MS, + replayMaxRunsPerEnv: env.REALTIME_BACKEND_NATIVE_REPLAY_MAX_RUNS, + unsubscribeLingerMs: env.REALTIME_BACKEND_NATIVE_UNSUBSCRIBE_LINGER_MS, + onReplay: (result) => replays.inc({ result }), }); const client = new NativeRealtimeClient({ diff --git a/apps/webapp/test/realtime/envChangeRouter.test.ts b/apps/webapp/test/realtime/envChangeRouter.test.ts index 4779a702bc5..46457fa2ca9 100644 --- a/apps/webapp/test/realtime/envChangeRouter.test.ts +++ b/apps/webapp/test/realtime/envChangeRouter.test.ts @@ -52,12 +52,19 @@ function fakeSource() { }; } -function makeRouter(rowsById: Map = new Map()) { +function makeRouter( + rowsById: Map = new Map(), + options: Record = {} +) { const src = fakeSource(); const hydrateSpy = vi.fn(async (_env, ids) => ids.map((id) => rowsById.get(id)).filter((r): r is RealtimeRunRow => Boolean(r)) ); - const router = new EnvChangeRouter({ source: src.source, hydrator: { hydrateByIds: hydrateSpy } }); + const router = new EnvChangeRouter({ + source: src.source, + hydrator: { hydrateByIds: hydrateSpy }, + ...options, + }); return { router, src, hydrateSpy }; } @@ -184,7 +191,7 @@ describe("EnvChangeRouter", () => { }); it("times out and aborts cleanly", async () => { - const { router, src } = makeRouter(); + const { router, src } = makeRouter(new Map(), { unsubscribeLingerMs: 0 }); const reg = router.register("env_1", { kind: "tag", tags: ["a"] }, []); expect((await reg.waitForMatch(undefined, 
30)).reason).toBe("timeout"); @@ -193,16 +200,116 @@ describe("EnvChangeRouter", () => { controller.abort(); expect((await wait).reason).toBe("abort"); reg.close(); - expect(src.isSubscribed("env_1")).toBe(false); // last feed left -> unsubscribed + expect(src.isSubscribed("env_1")).toBe(false); // linger disabled: last feed left -> unsubscribed + }); + + it("buffers a record that arrives between polls and replays it on the next arm", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows); + const reg = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + // Not waiting yet: the push can't wake anything, but it lands in the env buffer. + src.push("env_1", [record("r1", { tags: ["a"] })]); + expect(hydrateSpy).not.toHaveBeenCalled(); + + const result = await reg.waitForMatch(undefined, 1_000); + expect(result.reason).toBe("notify"); + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + reg.close(); }); - it("only routes to feeds currently waiting (gaps between polls fall to the backstop)", async () => { + it("does not redeliver a replayed record on a later arm", async () => { const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); const { router, src, hydrateSpy } = makeRouter(rows); const reg = router.register("env_1", { kind: "tag", tags: ["a"] }, []); - // Not waiting yet: a push is dropped (no hydrate, no buffering). src.push("env_1", [record("r1", { tags: ["a"] })]); + expect((await reg.waitForMatch(undefined, 1_000)).reason).toBe("notify"); + + // Same buffered record must not fire again; the wait falls through to its timeout. 
+ expect((await reg.waitForMatch(undefined, 50)).reason).toBe("timeout"); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + reg.close(); + }); + + it("lingers the env subscription after the last feed closes and replays the gap", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows, { unsubscribeLingerMs: 60 }); + const reg1 = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + reg1.close(); + expect(src.isSubscribed("env_1")).toBe(true); // lingering + + // The inter-poll gap: a change arrives while no feed is registered. + src.push("env_1", [record("r1", { tags: ["a"] })]); + + const reg2 = router.register("env_1", { kind: "tag", tags: ["a"] }, []); + const result = await reg2.waitForMatch(undefined, 1_000); + expect(result.reason).toBe("notify"); + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + expect(hydrateSpy).toHaveBeenCalledTimes(1); + + reg2.close(); + await new Promise((r) => setTimeout(r, 100)); + expect(src.isSubscribed("env_1")).toBe(false); // linger expired -> unsubscribed + }); + + it("reports gapCovered=false on a fresh env subscription and true once it ages past the window", async () => { + const { router } = makeRouter(new Map(), { replayWindowMs: 50 }); + const reg1 = router.register("env_1", { kind: "run", runId: "r1" }, []); + expect(reg1.gapCovered).toBe(false); + + await new Promise((r) => setTimeout(r, 70)); + const reg2 = router.register("env_1", { kind: "run", runId: "r2" }, []); + expect(reg2.gapCovered).toBe(true); + reg1.close(); + reg2.close(); + }); + + it("honors the caller's replaySinceMs so a new poll doesn't rewind into delivered records", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src, hydrateSpy } = makeRouter(rows); + const anchor = router.register("env_1", { kind: "tag", tags: ["a"] }, []); // keeps the env subscribed + src.push("env_1", [record("r1", { tags: ["a"] })]); + const 
afterPush = Date.now(); + + // A connection whose last response left after the push: nothing to replay. + const caughtUp = router.register("env_1", { kind: "tag", tags: ["a"] }, [], { + replaySinceMs: afterPush, + }); + expect(caughtUp.gapCovered).toBe(true); // env subscribed since before its gap began + expect((await caughtUp.waitForMatch(undefined, 50)).reason).toBe("timeout"); expect(hydrateSpy).not.toHaveBeenCalled(); + + // A connection whose gap started before the push: the record replays. + const behind = router.register("env_1", { kind: "tag", tags: ["a"] }, [], { + replaySinceMs: afterPush - 1_000, + }); + const result = await behind.waitForMatch(undefined, 1_000); + expect(result.reason).toBe("notify"); + expect(result.rows.map((m) => m.row.id)).toEqual(["r1"]); + + anchor.close(); + caughtUp.close(); + behind.close(); + }); + + it("caps the replay buffer to the newest records per env", async () => { + const rows = new Map([ + ["r1", row("r1")], + ["r2", row("r2")], + ["r3", row("r3")], + ]); + const { router, src, hydrateSpy } = makeRouter(rows, { replayMaxRunsPerEnv: 2 }); + const reg = router.register("env_1", { kind: "batch", batchId: "batch_1" }, []); + src.push("env_1", [ + record("r1", { batchId: "batch_1" }), + record("r2", { batchId: "batch_1" }), + record("r3", { batchId: "batch_1" }), + ]); + + const result = await reg.waitForMatch(undefined, 1_000); + expect(result.reason).toBe("notify"); + // r1 was evicted by the cap; only the newest two replay. 
+ expect(hydrateSpy).toHaveBeenCalledWith("env_1", ["r2", "r3"], []); reg.close(); }); }); diff --git a/apps/webapp/test/realtime/nativeHoldOnEmpty.test.ts b/apps/webapp/test/realtime/nativeHoldOnEmpty.test.ts index 4a356dd7e5f..43238797d3a 100644 --- a/apps/webapp/test/realtime/nativeHoldOnEmpty.test.ts +++ b/apps/webapp/test/realtime/nativeHoldOnEmpty.test.ts @@ -65,7 +65,14 @@ function makeClient(overrides: Record = {}) { ); const resolveSpy = vi.fn(async () => rowsToReturn.map((r) => r.id)); const src = fakeSource(); - const router = new EnvChangeRouter({ source: src.source, hydrator: { hydrateByIds: hydrateSpy } }); + const router = new EnvChangeRouter({ + source: src.source, + hydrator: { hydrateByIds: hydrateSpy }, + replayWindowMs: 0, + unsubscribeLingerMs: 0, + ...(overrides.routerOptions as Record ?? {}), + }); + delete overrides.routerOptions; const client = new NativeRealtimeClient({ runReader: { getRunById: async () => null, hydrateByIds: hydrateSpy } as any, @@ -175,6 +182,62 @@ describe("NativeRealtimeClient multi-run live path over the router", () => { expect(resolveSpy).toHaveBeenCalled(); }); + it("a cold env registration resolves immediately instead of holding blind", async () => { + // Fresh env subscription (gapCovered=false): a change in the inter-poll gap may have + // been missed, so the live poll probes once. The row advanced past the offset floor. 
+ const { client, resolveSpy, setRows } = makeClient({ + routerOptions: { replayWindowMs: 2_000 }, + }); + setRows([row("run_1", FLOOR_MS + 5_000, { tags: ["t"] })]); + + const res = await liveRuns(client); // no push needed — the cold probe finds the delta + expect(res.status).toBe(200); + expect(resolveSpy).toHaveBeenCalledTimes(1); + expect(hasRowOp(await bodyOf(res))).toBe(true); + }); + + it("a cold probe with nothing missed keeps holding", async () => { + const { client, src, resolveSpy, setRows } = makeClient({ + routerOptions: { replayWindowMs: 2_000 }, + livePollTimeoutMs: 1_500, + }); + setRows([row("run_1", FLOOR_MS - 1_000, { tags: ["t"] })]); // at/below the offset floor + + const responsePromise = liveRuns(client); + let settled = false; + void responsePromise.then(() => (settled = true)); + await whenWaiting(src); + await sleep(50); + expect(settled).toBe(false); // probed, found nothing missed, held + expect(resolveSpy).toHaveBeenCalledTimes(1); + await responsePromise; // drain via the backstop + }); + + it("a single-run poll holds on a replayed already-seen record instead of busy re-polling", async () => { + const { client, src, setRows } = makeClient({ + routerOptions: { replayWindowMs: 2_000 }, + livePollTimeoutMs: 300, + }); + setRows([row("run_1", FLOOR_MS + 1_000)]); + const url = `http://localhost:3030/realtime/v1/runs/run_1?offset=${FLOOR_MS + 1_000}_1&handle=run-run_1&live=true`; + + // First poll subscribes the env, then drains via its backstop. + const first = await client.streamRun(url, ENV, "run_1", CURRENT_API_VERSION, undefined, "1.0.0"); + expect(first.status).toBe(200); + + // The record lands between polls; the lingering env subscription buffers it. + src.push("env_1", [rec("run_1")]); + + // The next poll replays it, but the row hasn't advanced past the client's offset: + // the poll must HOLD (the old behavior returned up-to-date instantly = a busy loop). 
+ let settled = false; + const second = client.streamRun(url, ENV, "run_1", CURRENT_API_VERSION, undefined, "1.0.0"); + void second.then(() => (settled = true)); + await sleep(120); + expect(settled).toBe(false); + expect((await second).status).toBe(200); // drains via the backstop + }); + it("with holdOnEmpty=false, a matched-but-not-advanced change returns up-to-date without ClickHouse", async () => { const { client, src, resolveSpy, setRows } = makeClient({ holdOnEmpty: false }); // Matches the tag and is in-window, but updatedAt is at/below the offset floor -> no delta. diff --git a/apps/webapp/test/realtime/nativeRealtimeClient.test.ts b/apps/webapp/test/realtime/nativeRealtimeClient.test.ts index 3d113556679..b94c72a4e65 100644 --- a/apps/webapp/test/realtime/nativeRealtimeClient.test.ts +++ b/apps/webapp/test/realtime/nativeRealtimeClient.test.ts @@ -51,6 +51,8 @@ function makeClient(row: RealtimeRunRow | null) { router: new EnvChangeRouter({ source: { subscribeToEnv: () => () => {} }, hydrator: { hydrateByIds: async () => (row ? 
[row] : []) }, + replayWindowMs: 0, + unsubscribeLingerMs: 0, }), limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, cachedLimitProvider: { getCachedLimit: async () => 100 }, diff --git a/apps/webapp/test/realtime/nativeRunSetCache.test.ts b/apps/webapp/test/realtime/nativeRunSetCache.test.ts index 34beaee2d65..2389fd78080 100644 --- a/apps/webapp/test/realtime/nativeRunSetCache.test.ts +++ b/apps/webapp/test/realtime/nativeRunSetCache.test.ts @@ -31,6 +31,8 @@ function makeClient(overrides: Record = {}) { router: new EnvChangeRouter({ source: { subscribeToEnv: () => () => {} }, hydrator: { hydrateByIds: hydrateSpy }, + replayWindowMs: 0, + unsubscribeLingerMs: 0, }), limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, cachedLimitProvider: { getCachedLimit: async () => 100 }, @@ -182,6 +184,8 @@ describe("NativeRealtimeClient resolve admission gate (mass-reconnect stampede)" router: new EnvChangeRouter({ source: { subscribeToEnv: () => () => {} }, hydrator: { hydrateByIds: hydrateSpy }, + replayWindowMs: 0, + unsubscribeLingerMs: 0, }), limiter: { incrementAndCheck: async () => true, decrement: async () => {} } as any, cachedLimitProvider: { getCachedLimit: async () => 100 }, From 46a0cbbd8bb36daff153cf493bb469cc9875a271 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 10 Jun 2026 14:44:13 +0100 Subject: [PATCH 17/23] fix(webapp): match multi-tag realtime subscriptions to contains-all semantics Subscribing to runs with multiple tags matched runs carrying any of them, while the default backend requires all of them. The ClickHouse resolve and the router's row check now both require every subscribed tag, so multi-tag feeds behave identically across backends. The dashboard runs list keeps its any-match filter. 
--- .../clickHouseRunListResolver.server.ts | 6 ++++-- .../realtime/envChangeRouter.server.ts | 5 +++-- .../clickhouseRunsRepository.server.ts | 4 +++- .../runsRepository/runsRepository.server.ts | 2 ++ .../test/realtime/envChangeRouter.test.ts | 19 +++++++++++++++++++ 5 files changed, 31 insertions(+), 5 deletions(-) diff --git a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts index 7c74d15add6..317b1c15454 100644 --- a/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts +++ b/apps/webapp/app/services/realtime/clickHouseRunListResolver.server.ts @@ -11,8 +11,8 @@ export type ClickHouseRunListResolverOptions = { /** * Resolves the realtime tag/list filter into matching run ids via ClickHouse `listRunIds` (filter-only; - * rows hydrated from Postgres by id afterward). Tag matching is contains-ANY (OR) — note this differs from - * Electric's `runTags @> ARRAY[...]` AND shape; restoring AND needs a `hasAll` mode on the ClickHouse filter. + * rows hydrated from Postgres by id afterward). Tag matching is contains-ALL, byte-matching Electric's + * `runTags @> ARRAY[...]` shape. */ export class ClickHouseRunListResolver implements RunListResolver { constructor(private readonly options: ClickHouseRunListResolverOptions) {} @@ -26,6 +26,8 @@ export class ClickHouseRunListResolver implements RunListResolver { projectId: filter.projectId, environmentId: filter.environmentId, tags: filter.tags && filter.tags.length > 0 ? filter.tags : undefined, + // Contains-ALL, matching the Electric shape's `runTags @> ARRAY[...]` semantics. 
+ tagsMatch: "all", batchId: filter.batchId, from: filter.createdAtAfter?.getTime(), page: { size: filter.limit }, diff --git a/apps/webapp/app/services/realtime/envChangeRouter.server.ts b/apps/webapp/app/services/realtime/envChangeRouter.server.ts index 7fac5cf5dd6..166a03d38b7 100644 --- a/apps/webapp/app/services/realtime/envChangeRouter.server.ts +++ b/apps/webapp/app/services/realtime/envChangeRouter.server.ts @@ -488,12 +488,13 @@ export class EnvChangeRouter { } } - /** Authoritative re-check for tag feeds: the hydrated row's tags intersect the filter and its createdAt is within the feed's window. */ + /** Authoritative re-check for tag feeds: the hydrated row carries ALL the filter's tags + * (Electric's `runTags @> ARRAY[...]` semantics) and its createdAt is within the window. */ #tagRowMatches(row: RealtimeRunRow, filter: Extract): boolean { if (filter.createdAtFloorMs !== undefined && row.createdAt.getTime() < filter.createdAtFloorMs) { return false; } const rowTags = row.runTags ?? []; - return filter.tags.some((tag) => rowTags.includes(tag)); + return filter.tags.every((tag) => rowTags.includes(tag)); } } diff --git a/apps/webapp/app/services/runsRepository/clickhouseRunsRepository.server.ts b/apps/webapp/app/services/runsRepository/clickhouseRunsRepository.server.ts index fcf1c811d70..304777b39e0 100644 --- a/apps/webapp/app/services/runsRepository/clickhouseRunsRepository.server.ts +++ b/apps/webapp/app/services/runsRepository/clickhouseRunsRepository.server.ts @@ -321,7 +321,9 @@ function applyRunFiltersToQueryBuilder( } if (options.tags && options.tags.length > 0) { - queryBuilder.where("hasAny(tags, {tags: Array(String)})", { tags: options.tags }); + // Both hasAny and hasAll are served by the tags bloom_filter skip index. + const tagsFn = options.tagsMatch === "all" ? 
"hasAll" : "hasAny"; + queryBuilder.where(`${tagsFn}(tags, {tags: Array(String)})`, { tags: options.tags }); } if (options.scheduleId) { diff --git a/apps/webapp/app/services/runsRepository/runsRepository.server.ts b/apps/webapp/app/services/runsRepository/runsRepository.server.ts index f4eeb5466d0..74963bc3ff2 100644 --- a/apps/webapp/app/services/runsRepository/runsRepository.server.ts +++ b/apps/webapp/app/services/runsRepository/runsRepository.server.ts @@ -30,6 +30,8 @@ const RunListInputOptionsSchema = z.object({ versions: z.array(z.string()).optional(), statuses: z.array(RunStatus).optional(), tags: z.array(z.string()).optional(), + // "any" (default) = run has at least one of `tags`; "all" = run has every tag. + tagsMatch: z.enum(["any", "all"]).optional(), scheduleId: z.string().optional(), period: z.string().optional(), from: z.number().optional(), diff --git a/apps/webapp/test/realtime/envChangeRouter.test.ts b/apps/webapp/test/realtime/envChangeRouter.test.ts index 46457fa2ca9..33603728ec6 100644 --- a/apps/webapp/test/realtime/envChangeRouter.test.ts +++ b/apps/webapp/test/realtime/envChangeRouter.test.ts @@ -151,6 +151,25 @@ describe("EnvChangeRouter", () => { reg.close(); }); + it("multi-tag feeds require ALL tags on the row (Electric contains-all semantics)", async () => { + const rows = new Map([ + ["r_both", row("r_both", { tags: ["a", "b", "c"] })], + ["r_one", row("r_one", { tags: ["a"] })], + ]); + const { router, src } = makeRouter(rows); + const reg = router.register("env_1", { kind: "tag", tags: ["a", "b"] }, []); + const wait = reg.waitForMatch(undefined, 1_000); + + // r_one shares a tag (routes as a candidate via the index) but lacks "b" — must be + // culled by the authoritative row check. r_both carries both and wakes the feed. 
+ src.push("env_1", [record("r_one", { tags: ["a"] }), record("r_both", { tags: ["a", "b", "c"] })]); + + const result = await wait; + expect(result.reason).toBe("notify"); + expect(result.rows.map((m) => m.row.id)).toEqual(["r_both"]); + reg.close(); + }); + it("drops a tag match created before the feed's createdAt floor", async () => { const rows = new Map([["r1", row("r1", { tags: ["a"], createdAtMs: FLOOR_MS - 10_000 })]]); const { router, src } = makeRouter(rows); From 1b82875965a78edf426e508165fe52f61697e293 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 10 Jun 2026 15:42:26 +0100 Subject: [PATCH 18/23] feat(webapp): add delivery-lag and health metrics to the realtime backend New metrics covering the questions an operator asks during rollout: an end-to-end delivery-lag histogram per emission path, a backstop outcome split (sustained delivered counts mean live wakes are being missed), publish success/failure and coalesce-ratio counters on the pub/sub side, held-feed and active-environment gauges as the capacity signal, concurrency-limiter rejections, replay-buffer evictions split by cause, and a rows-per-emission histogram that exposes full-set re-emission regressions. 
--- .../realtime/envChangeRouter.server.ts | 15 +++++ .../realtime/nativeRealtimeClient.server.ts | 19 ++++++ .../nativeRealtimeClientInstance.server.ts | 64 +++++++++++++++++++ .../realtime/runChangeNotifier.server.ts | 29 +++++++-- .../runChangeNotifierInstance.server.ts | 24 ++++++- .../test/realtime/envChangeRouter.test.ts | 7 +- .../test/realtime/nativeHoldOnEmpty.test.ts | 15 ++++- 7 files changed, 162 insertions(+), 11 deletions(-) diff --git a/apps/webapp/app/services/realtime/envChangeRouter.server.ts b/apps/webapp/app/services/realtime/envChangeRouter.server.ts index 166a03d38b7..54ec9e3fea8 100644 --- a/apps/webapp/app/services/realtime/envChangeRouter.server.ts +++ b/apps/webapp/app/services/realtime/envChangeRouter.server.ts @@ -48,6 +48,9 @@ export type EnvChangeRouterOptions = { unsubscribeLingerMs?: number; /** Observability: a replay scan found candidates and delivered rows (or none survived). */ onReplay?: (result: "delivered" | "empty") => void; + /** Observability: a buffered record was evicted. `cap` evictions mean the env churns more + * runs inside the window than the buffer holds (the replay guarantee is degrading). */ + onReplayEviction?: (reason: "cap" | "window") => void; }; const DEFAULT_REPLAY_WINDOW_MS = 2_000; @@ -210,6 +213,17 @@ export class EnvChangeRouter { return this.#envs.size; } + /** Currently-held feeds by kind (for metrics) — the system's capacity unit. */ + get heldFeedCounts(): { run: number; tag: number; batch: number } { + const counts = { run: 0, tag: 0, batch: 0 }; + for (const env of this.#envs.values()) { + for (const feed of env.feeds) { + counts[feed.filter.kind]++; + } + } + return counts; + } + #ensureEnv(environmentId: string): EnvState { const existing = this.#envs.get(environmentId); if (existing) { @@ -282,6 +296,7 @@ export class EnvChangeRouter { if (entry.receivedAtMs >= cutoff && env.recent.size <= maxRuns) { break; } + this.options.onReplayEviction?.(entry.receivedAtMs < cutoff ? 
"window" : "cap"); env.recent.delete(runId); } } diff --git a/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts b/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts index 41e2944924e..bd693ec2788 100644 --- a/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts +++ b/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts @@ -116,6 +116,14 @@ export type NativeRealtimeClientOptions = { onRunSetQuery?: (stage: "resolve" | "hydrate", ms: number) => void; /** Observability hook: a fresh resolve waited `ms` for an admission permit (only when the gate engaged). */ onResolveAdmissionWait?: (ms: number) => void; + /** Observability hook: a live emission left the server — lag is now minus the newest + * emitted row's updatedAt (the end-to-end delivery SLI), rowCount the delta size. */ + onEmit?: (path: LivePollPath, lagMs: number, rowCount: number) => void; + /** Observability hook: a backstop resolve found missed changes (delivered) or nothing + * (empty). Sustained `delivered` means the notify/replay path is leaking. */ + onBackstopResult?: (result: "delivered" | "empty") => void; + /** Observability hook: a poll was rejected by the per-env concurrency limiter (429). 
*/ + onConcurrencyRejected?: () => void; }; const DEFAULT_CONCURRENCY_LIMIT = 100_000; @@ -414,6 +422,7 @@ export class NativeRealtimeClient implements RealtimeStreamClient { const probed = await this.options.runReader.getRunById(environment.id, runId); if (probed && probed.updatedAt.getTime() > lastSeenMs) { const seq = this.#nextSeq(); + this.options.onEmit?.("cold-resolve", Date.now() - probed.updatedAt.getTime(), 1); return this.#buildResponse( buildUpdateBody(probed, skipColumns), apiVersion, @@ -450,6 +459,7 @@ export class NativeRealtimeClient implements RealtimeStreamClient { const updatedAtMs = matched.row.updatedAt.getTime(); if (updatedAtMs > lastSeenMs) { const seq = this.#nextSeq(); + this.options.onEmit?.("fast-hydrate", Date.now() - updatedAtMs, 1); return this.#buildResponse( buildRowsBodyFromSerialized([ { runId: matched.row.id, value: matched.value, operation: "update" }, @@ -469,6 +479,8 @@ export class NativeRealtimeClient implements RealtimeStreamClient { const row = await this.options.runReader.getRunById(environment.id, runId); const seq = this.#nextSeq(); if (row && row.updatedAt.getTime() > lastSeenMs) { + this.options.onBackstopResult?.("delivered"); + this.options.onEmit?.("full-resolve", Date.now() - row.updatedAt.getTime(), 1); return this.#buildResponse( buildUpdateBody(row, skipColumns), apiVersion, @@ -480,6 +492,7 @@ export class NativeRealtimeClient implements RealtimeStreamClient { } ); } + this.options.onBackstopResult?.("empty"); return this.#buildResponse(buildUpToDateBody(), apiVersion, clientVersion, { offset, handle, @@ -601,6 +614,7 @@ export class NativeRealtimeClient implements RealtimeStreamClient { this.#workingSetCache.set(workingSetKey, touched); prevSeen = touched; if (changes.length > 0) { + this.options.onEmit?.("cold-resolve", Date.now() - maxUpdatedAt, changes.length); return emitFromRows(changes, maxUpdatedAt); } continue; // nothing was missed — hold as usual @@ -633,6 +647,7 @@ export class 
NativeRealtimeClient implements RealtimeStreamClient { prevSeen = merged; if (changes.length > 0) { + this.options.onEmit?.("fast-hydrate", Date.now() - maxUpdatedAt, changes.length); return emitFromSerialized(changes, maxUpdatedAt); } // Matched but no row advanced (already seen). Keep holding. @@ -655,10 +670,13 @@ export class NativeRealtimeClient implements RealtimeStreamClient { prevSeen = touched; if (changes.length > 0) { + this.options.onBackstopResult?.("delivered"); + this.options.onEmit?.("full-resolve", Date.now() - maxUpdatedAt, changes.length); return emitFromRows(changes, maxUpdatedAt); } // Empty backstop diff: timeout returns up-to-date; (holdOnEmpty never reaches // here on a notify — those are handled in the fast path above). + this.options.onBackstopResult?.("empty"); return emitUpToDate(maxUpdatedAt); } } finally { @@ -946,6 +964,7 @@ export class NativeRealtimeClient implements RealtimeStreamClient { ); if (!canProceed) { + this.options.onConcurrencyRejected?.(); return json({ error: "Too many concurrent requests" }, { status: 429 }); } diff --git a/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts index af1c214c7ec..1fc164ea6e0 100644 --- a/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts +++ b/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts @@ -63,6 +63,41 @@ function initializeNativeRealtimeClient(): NativeRealtimeClient { registers: [metricsRegister], }); + const deliveryLagMs = new Histogram({ + name: "realtime_native_delivery_lag_ms", + help: "Live emissions: now minus the newest emitted row's updatedAt (PG clock vs app clock, so approximate). 
The end-to-end delivery SLI — a p99 near the backstop hold means wakes are being missed.", + labelNames: ["path"] as const, + buckets: [5, 10, 25, 50, 100, 250, 500, 1_000, 2_500, 5_000, 10_000, 30_000], + registers: [metricsRegister], + }); + + const emittedRows = new Histogram({ + name: "realtime_native_emitted_rows", + help: "Rows per live emission. Deltas should be small; a fat tail means working-set/offset-floor fallbacks are re-emitting full sets.", + buckets: [1, 2, 5, 10, 25, 50, 100, 250, 1_000], + registers: [metricsRegister], + }); + + const backstops = new Counter({ + name: "realtime_native_backstop_total", + help: "Backstop full resolves by outcome. 'empty' is normal idle behavior; sustained 'delivered' means the notify/replay path missed changes — alert on it.", + labelNames: ["result"] as const, + registers: [metricsRegister], + }); + + const concurrencyRejections = new Counter({ + name: "realtime_native_concurrency_rejections_total", + help: "Polls rejected (429) by the per-env concurrency limiter.", + registers: [metricsRegister], + }); + + const replayEvictions = new Counter({ + name: "realtime_native_replay_evictions_total", + help: "Replay-buffer evictions. 
'window' expiry is normal; 'cap' means an env churns more runs inside the window than the buffer holds (replay guarantee degrading — retune the knobs).", + labelNames: ["reason"] as const, + registers: [metricsRegister], + }); + const limiter = new RealtimeConcurrencyLimiter({ keyPrefix: "tr:realtime:native:concurrency", redis: { @@ -90,6 +125,7 @@ function initializeNativeRealtimeClient(): NativeRealtimeClient { replayMaxRunsPerEnv: env.REALTIME_BACKEND_NATIVE_REPLAY_MAX_RUNS, unsubscribeLingerMs: env.REALTIME_BACKEND_NATIVE_UNSUBSCRIBE_LINGER_MS, onReplay: (result) => replays.inc({ result }), + onReplayEviction: (reason) => replayEvictions.inc({ reason }), }); const client = new NativeRealtimeClient({ @@ -128,6 +164,12 @@ function initializeNativeRealtimeClient(): NativeRealtimeClient { onRunSetResolve: (result) => runSetResolves.inc({ result }), onRunSetQuery: (stage, ms) => runSetQueryMs.observe({ stage }, ms), onResolveAdmissionWait: () => resolveAdmissionWaits.inc(), + onEmit: (path, lagMs, rowCount) => { + deliveryLagMs.observe({ path }, Math.max(lagMs, 0)); + emittedRows.observe(rowCount); + }, + onBackstopResult: (result) => backstops.inc({ result }), + onConcurrencyRejected: () => concurrencyRejections.inc(), }); new Gauge({ @@ -148,6 +190,28 @@ function initializeNativeRealtimeClient(): NativeRealtimeClient { }, }); + new Gauge({ + name: "realtime_native_held_feeds", + help: "Long-polls currently held, by feed kind — the system's capacity unit.", + labelNames: ["kind"] as const, + registers: [metricsRegister], + collect() { + const counts = router.heldFeedCounts; + this.set({ kind: "run" }, counts.run); + this.set({ kind: "tag" }, counts.tag); + this.set({ kind: "batch" }, counts.batch); + }, + }); + + new Gauge({ + name: "realtime_native_active_envs", + help: "Environments currently routed on this instance (held feeds + lingering subscriptions).", + registers: [metricsRegister], + collect() { + this.set(router.activeEnvCount); + }, + }); + return 
client; } diff --git a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts index 58b1e5bb931..f295c02d3f8 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifier.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifier.server.ts @@ -60,6 +60,12 @@ export type RunChangeNotifierOptions = { envWakeCoalesceWindowMs?: number; /** Use Redis sharded pub/sub (SSUBSCRIBE/SPUBLISH); cluster-only and requires `clusterOptions.shardedSubscribers`. Defaults to false (classic). */ shardedPubSub?: boolean; + /** Observability hook: a publish settled (ok) or failed (the leading degradation signal). */ + onPublishResult?: (ok: boolean) => void; + /** Observability hook: a raw channel message arrived (pre-coalesce). */ + onMessageReceived?: () => void; + /** Observability hook: a coalesced batch was delivered to listeners (records per batch). */ + onBatchDelivered?: (recordCount: number) => void; }; const DEFAULT_CHANNEL_PREFIX = "realtime:"; @@ -115,15 +121,22 @@ export class RunChangeNotifier { const result = this.#sharded ? 
publisher.spublish(channel, payload) : publisher.publish(channel, payload); - if (typeof (result as Promise)?.catch === "function") { - (result as Promise).catch((error) => { - logger.error("[runChangeNotifier] Failed to publish run-changed notification", { - error, - channel, - }); - }); + if (typeof (result as Promise)?.then === "function") { + (result as Promise).then( + () => this.options.onPublishResult?.(true), + (error) => { + this.options.onPublishResult?.(false); + logger.error("[runChangeNotifier] Failed to publish run-changed notification", { + error, + channel, + }); + } + ); + } else { + this.options.onPublishResult?.(true); } } catch (error) { + this.options.onPublishResult?.(false); logger.error("[runChangeNotifier] Failed to publish run-changed notification", { error, channel, @@ -241,6 +254,7 @@ export class RunChangeNotifier { } #onMessage(channel: string, message: string) { + this.options.onMessageReceived?.(); // Accumulate the decoded record (deduped by runId) before delivering, so a coalesced // wake carries every run that moved during the window. 
this.#addPending(channel, decodeChangeRecord(message)); @@ -274,6 +288,7 @@ export class RunChangeNotifier { if (!listeners || batch.length === 0) { return; } + this.options.onBatchDelivered?.(batch.length); for (const onBatch of [...listeners]) { onBatch(batch); } diff --git a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts index c24a822ba4e..bf0cb456e23 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts @@ -1,4 +1,4 @@ -import { Gauge } from "prom-client"; +import { Counter, Gauge } from "prom-client"; import { env } from "~/env.server"; import { metricsRegister } from "~/metrics.server"; import { singleton } from "~/utils/singleton"; @@ -16,6 +16,25 @@ function initializeRunChangeNotifier(): RunChangeNotifier { // broadcast every message to every node, so this is what actually shards load. const shardedPubSub = clusterMode && env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_SHARDED_ENABLED === "1"; + const publishes = new Counter({ + name: "realtime_run_change_notifier_publishes_total", + help: "Change-record publishes by outcome. Failures are the leading indicator that feeds are degrading to their backstops (pub/sub Redis trouble).", + labelNames: ["result"] as const, + registers: [metricsRegister], + }); + + const received = new Counter({ + name: "realtime_run_change_notifier_messages_received_total", + help: "Raw channel messages received by this instance's subscriber, pre-coalesce.", + registers: [metricsRegister], + }); + + const delivered = new Counter({ + name: "realtime_run_change_notifier_batches_delivered_total", + help: "Coalesced batches delivered to listeners. 
received/batches = the coalesce ratio (how hard a busy env is being collapsed).", + registers: [metricsRegister], + }); + const notifier = new RunChangeNotifier({ redis: { host: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_HOST, @@ -29,6 +48,9 @@ function initializeRunChangeNotifier(): RunChangeNotifier { }, envWakeCoalesceWindowMs: env.REALTIME_BACKEND_NATIVE_ENV_WAKE_COALESCE_WINDOW_MS, shardedPubSub, + onPublishResult: (ok) => publishes.inc({ result: ok ? "ok" : "error" }), + onMessageReceived: () => received.inc(), + onBatchDelivered: () => delivered.inc(), }); new Gauge({ diff --git a/apps/webapp/test/realtime/envChangeRouter.test.ts b/apps/webapp/test/realtime/envChangeRouter.test.ts index 33603728ec6..b9688d98f55 100644 --- a/apps/webapp/test/realtime/envChangeRouter.test.ts +++ b/apps/webapp/test/realtime/envChangeRouter.test.ts @@ -317,7 +317,11 @@ describe("EnvChangeRouter", () => { ["r2", row("r2")], ["r3", row("r3")], ]); - const { router, src, hydrateSpy } = makeRouter(rows, { replayMaxRunsPerEnv: 2 }); + const evictions: string[] = []; + const { router, src, hydrateSpy } = makeRouter(rows, { + replayMaxRunsPerEnv: 2, + onReplayEviction: (reason: string) => evictions.push(reason), + }); const reg = router.register("env_1", { kind: "batch", batchId: "batch_1" }, []); src.push("env_1", [ record("r1", { batchId: "batch_1" }), @@ -329,6 +333,7 @@ describe("EnvChangeRouter", () => { expect(result.reason).toBe("notify"); // r1 was evicted by the cap; only the newest two replay. 
expect(hydrateSpy).toHaveBeenCalledWith("env_1", ["r2", "r3"], []); + expect(evictions).toEqual(["cap"]); reg.close(); }); }); diff --git a/apps/webapp/test/realtime/nativeHoldOnEmpty.test.ts b/apps/webapp/test/realtime/nativeHoldOnEmpty.test.ts index 43238797d3a..615abc90394 100644 --- a/apps/webapp/test/realtime/nativeHoldOnEmpty.test.ts +++ b/apps/webapp/test/realtime/nativeHoldOnEmpty.test.ts @@ -120,7 +120,10 @@ const isUpToDate = (body: Awaited>) => describe("NativeRealtimeClient multi-run live path over the router", () => { it("a matching change hydrates by id (no ClickHouse) and returns a delta", async () => { - const { client, src, hydrateSpy, resolveSpy, setRows } = makeClient(); + const emits: Array<[string, number, number]> = []; + const { client, src, hydrateSpy, resolveSpy, setRows } = makeClient({ + onEmit: (path: string, lagMs: number, rows: number) => emits.push([path, lagMs, rows]), + }); setRows([row("run_1", FLOOR_MS + 5_000, { tags: ["t"] })]); const responsePromise = liveRuns(client); @@ -132,6 +135,9 @@ describe("NativeRealtimeClient multi-run live path over the router", () => { expect(hasRowOp(await bodyOf(res))).toBe(true); expect(resolveSpy).not.toHaveBeenCalled(); // ClickHouse skipped expect(hydrateSpy).toHaveBeenCalledWith("env_1", ["run_1"], expect.anything()); + expect(emits).toHaveLength(1); + expect(emits[0][0]).toBe("fast-hydrate"); + expect(emits[0][2]).toBe(1); // one delta row }); it("a change that doesn't match the filter never wakes the feed (no CH, no PG); a later match does", async () => { @@ -175,11 +181,16 @@ describe("NativeRealtimeClient multi-run live path over the router", () => { }); it("the backstop timeout does a full ClickHouse resolve and returns up-to-date", async () => { - const { client, resolveSpy } = makeClient({ livePollTimeoutMs: 50 }); + const backstopResults: string[] = []; + const { client, resolveSpy } = makeClient({ + livePollTimeoutMs: 50, + onBackstopResult: (r: string) => backstopResults.push(r), + 
}); const res = await liveRuns(client); // never pushed -> backstop fires expect(res.status).toBe(200); expect(isUpToDate(await bodyOf(res))).toBe(true); expect(resolveSpy).toHaveBeenCalled(); + expect(backstopResults).toEqual(["empty"]); }); it("a cold env registration resolves immediately instead of holding blind", async () => { From 337a4f0460636573ab6f9965431e147ede60a19c Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 10 Jun 2026 15:56:35 +0100 Subject: [PATCH 19/23] fix(webapp): emit realtime backend metrics through OpenTelemetry The realtime backend's metrics were registered with the in-process Prometheus registry, which is no longer how the webapp ships metrics. They now emit through the OpenTelemetry meter (realtime_native.*, realtime_notifier.*, realtime_shadow.*) so they flow through the internal metrics exporter like the rest of the webapp's instrumentation. Gauges become observable gauges sampled at export time; names move from prom-style _total suffixes to the meter's dot-namespaced convention. 
--- .../nativeRealtimeClientInstance.server.ts | 197 ++++++++---------- .../runChangeNotifierInstance.server.ts | 45 ++-- .../shadowRealtimeClientInstance.server.ts | 26 ++- 3 files changed, 115 insertions(+), 153 deletions(-) diff --git a/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts index 1fc164ea6e0..948d791f850 100644 --- a/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts +++ b/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts @@ -1,7 +1,6 @@ -import { Counter, Gauge, Histogram } from "prom-client"; +import { getMeter } from "@internal/tracing"; import { $replica } from "~/db.server"; import { env } from "~/env.server"; -import { metricsRegister } from "~/metrics.server"; import { singleton } from "~/utils/singleton"; import { getCachedLimit } from "../platform.v3.server"; import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; @@ -15,87 +14,67 @@ import { RunHydrator } from "./runReader.server"; // Process-singleton wiring for the native realtime client; only constructed when a // request actually routes to it, so a disabled webapp never instantiates it. function initializeNativeRealtimeClient(): NativeRealtimeClient { - const wakeups = new Counter({ - name: "realtime_native_wakeups_total", - help: "Live realtime wakeups by reason. A rising 'timeout' share suggests a write site is missing its publishChangeRecord delegate.", - labelNames: ["reason"] as const, - registers: [metricsRegister], + const meter = getMeter("realtime-native"); + + const wakeups = meter.createCounter("realtime_native.wakeups", { + description: + "Live realtime wakeups by reason. 
A rising 'timeout' share suggests a write site is missing its publishChangeRecord delegate.", }); - const runSetResolves = new Counter({ - name: "realtime_native_runset_resolve_total", - help: "Multi-run (tag-list/batch) resolve+hydrate outcomes. 'hit'/'coalesced' vs 'miss' shows how effectively concurrent same-filter feeds share a single ClickHouse + Postgres query under an env-wide wake.", - labelNames: ["result"] as const, - registers: [metricsRegister], + const runSetResolves = meter.createCounter("realtime_native.runset_resolves", { + description: + "Multi-run (tag-list/batch) resolve+hydrate outcomes. 'hit'/'coalesced' vs 'miss' shows how effectively concurrent same-filter feeds share a single ClickHouse + Postgres query.", }); - const runSetQueryMs = new Histogram({ - name: "realtime_native_runset_query_ms", - help: "Latency of the multi-run resolve (ClickHouse) and hydrate (Postgres) stages.", - labelNames: ["stage"] as const, - buckets: [1, 5, 10, 25, 50, 100, 250, 500, 1_000, 2_500, 5_000], - registers: [metricsRegister], + const runSetQueryMs = meter.createHistogram("realtime_native.runset_query_ms", { + description: "Latency of the multi-run resolve (ClickHouse) and hydrate (Postgres) stages.", + unit: "ms", }); - const livePollPaths = new Counter({ - name: "realtime_native_live_poll_total", - help: "How live polls resolved. 'fast-hydrate' = the router woke the feed with matched runs hydrated by id (no ClickHouse); 'full-resolve' = the backstop timeout did a ClickHouse resolve. A high fast-path share is the local-membership routing working.", - labelNames: ["path"] as const, - registers: [metricsRegister], + const livePollPaths = meter.createCounter("realtime_native.live_polls", { + description: + "How live polls resolved. 
'fast-hydrate' = router wake with rows hydrated by id (no ClickHouse); 'full-resolve' = backstop; 'cold-resolve' = fresh env subscription probed once.", }); - const routerHydrates = new Counter({ - name: "realtime_native_router_hydrated_runs_total", - help: "Runs hydrated by the EnvChangeRouter's batch-hydrate (one query per column set per wake, shared across all feeds matching the same run — the hot-shared-tag fan-out collapse).", - registers: [metricsRegister], + const routerHydrates = meter.createCounter("realtime_native.router_hydrated_runs", { + description: + "Runs hydrated by the EnvChangeRouter's batch-hydrate (one query per column set per wake, shared across all feeds matching the same run).", }); - const resolveAdmissionWaits = new Counter({ - name: "realtime_native_resolve_admission_waits_total", - help: "Fresh ClickHouse resolves that had to queue for an admission permit. A rising count means a distinct-filter reconnect stampede is being throttled (the gate is doing its job).", - registers: [metricsRegister], + const resolveAdmissionWaits = meter.createCounter("realtime_native.resolve_admission_waits", { + description: + "Fresh ClickHouse resolves that had to queue for an admission permit. A rising count means a distinct-filter reconnect stampede is being throttled (the gate is doing its job).", }); - const replays = new Counter({ - name: "realtime_native_replays_total", - help: "Buffered change records replayed to a newly-armed feed (inter-poll gap recovery). 'delivered' = rows reached the feed; 'empty' = candidates hydrated but none survived the filter/diff.", - labelNames: ["result"] as const, - registers: [metricsRegister], + const replays = meter.createCounter("realtime_native.replays", { + description: + "Buffered change records replayed to a newly-armed feed (inter-poll gap recovery). 
'delivered' = rows reached the feed; 'empty' = candidates hydrated but none survived the filter/diff.", }); - const deliveryLagMs = new Histogram({ - name: "realtime_native_delivery_lag_ms", - help: "Live emissions: now minus the newest emitted row's updatedAt (PG clock vs app clock, so approximate). The end-to-end delivery SLI — a p99 near the backstop hold means wakes are being missed.", - labelNames: ["path"] as const, - buckets: [5, 10, 25, 50, 100, 250, 500, 1_000, 2_500, 5_000, 10_000, 30_000], - registers: [metricsRegister], + const replayEvictions = meter.createCounter("realtime_native.replay_evictions", { + description: + "Replay-buffer evictions. 'window' expiry is normal; 'cap' means an env churns more runs inside the window than the buffer holds (replay guarantee degrading — retune the knobs).", }); - const emittedRows = new Histogram({ - name: "realtime_native_emitted_rows", - help: "Rows per live emission. Deltas should be small; a fat tail means working-set/offset-floor fallbacks are re-emitting full sets.", - buckets: [1, 2, 5, 10, 25, 50, 100, 250, 1_000], - registers: [metricsRegister], + const deliveryLagMs = meter.createHistogram("realtime_native.delivery_lag_ms", { + description: + "Live emissions: now minus the newest emitted row's updatedAt (PG clock vs app clock, so approximate). The end-to-end delivery SLI — a p99 near the backstop hold means wakes are being missed.", + unit: "ms", }); - const backstops = new Counter({ - name: "realtime_native_backstop_total", - help: "Backstop full resolves by outcome. 'empty' is normal idle behavior; sustained 'delivered' means the notify/replay path missed changes — alert on it.", - labelNames: ["result"] as const, - registers: [metricsRegister], + const emittedRows = meter.createHistogram("realtime_native.emitted_rows", { + description: + "Rows per live emission. 
Deltas should be small; a fat tail means working-set/offset-floor fallbacks are re-emitting full sets.", + unit: "rows", }); - const concurrencyRejections = new Counter({ - name: "realtime_native_concurrency_rejections_total", - help: "Polls rejected (429) by the per-env concurrency limiter.", - registers: [metricsRegister], + const backstops = meter.createCounter("realtime_native.backstops", { + description: + "Backstop full resolves by outcome. 'empty' is normal idle behavior; sustained 'delivered' means the notify/replay path missed changes — alert on it.", }); - const replayEvictions = new Counter({ - name: "realtime_native_replay_evictions_total", - help: "Replay-buffer evictions. 'window' expiry is normal; 'cap' means an env churns more runs inside the window than the buffer holds (replay guarantee degrading — retune the knobs).", - labelNames: ["reason"] as const, - registers: [metricsRegister], + const concurrencyRejections = meter.createCounter("realtime_native.concurrency_rejections", { + description: "Polls rejected (429) by the per-env concurrency limiter.", }); const limiter = new RealtimeConcurrencyLimiter({ @@ -120,12 +99,12 @@ function initializeNativeRealtimeClient(): NativeRealtimeClient { const router = new EnvChangeRouter({ source: getRunChangeNotifier(), hydrator: runReader, - onHydrate: (runCount) => routerHydrates.inc(runCount), + onHydrate: (runCount) => routerHydrates.add(runCount), replayWindowMs: env.REALTIME_BACKEND_NATIVE_REPLAY_WINDOW_MS, replayMaxRunsPerEnv: env.REALTIME_BACKEND_NATIVE_REPLAY_MAX_RUNS, unsubscribeLingerMs: env.REALTIME_BACKEND_NATIVE_UNSUBSCRIBE_LINGER_MS, - onReplay: (result) => replays.inc({ result }), - onReplayEviction: (reason) => replayEvictions.inc({ reason }), + onReplay: (result) => replays.add(1, { result }), + onReplayEviction: (reason) => replayEvictions.add(1, { reason }), }); const client = new NativeRealtimeClient({ @@ -159,58 +138,50 @@ function initializeNativeRealtimeClient(): NativeRealtimeClient { 
runSetCreatedAtBucketMs: env.REALTIME_BACKEND_NATIVE_RUNSET_CREATED_AT_BUCKET_MS, holdOnEmpty: env.REALTIME_BACKEND_NATIVE_HOLD_ON_EMPTY === "1", resolveAdmissionLimit: env.REALTIME_BACKEND_NATIVE_RESOLVE_ADMISSION_LIMIT, - onWakeup: (reason) => wakeups.inc({ reason }), - onLivePollPath: (path) => livePollPaths.inc({ path }), - onRunSetResolve: (result) => runSetResolves.inc({ result }), - onRunSetQuery: (stage, ms) => runSetQueryMs.observe({ stage }, ms), - onResolveAdmissionWait: () => resolveAdmissionWaits.inc(), + onWakeup: (reason) => wakeups.add(1, { reason }), + onLivePollPath: (path) => livePollPaths.add(1, { path }), + onRunSetResolve: (result) => runSetResolves.add(1, { result }), + onRunSetQuery: (stage, ms) => runSetQueryMs.record(ms, { stage }), + onResolveAdmissionWait: () => resolveAdmissionWaits.add(1), onEmit: (path, lagMs, rowCount) => { - deliveryLagMs.observe({ path }, Math.max(lagMs, 0)); - emittedRows.observe(rowCount); - }, - onBackstopResult: (result) => backstops.inc({ result }), - onConcurrencyRejected: () => concurrencyRejections.inc(), - }); - - new Gauge({ - name: "realtime_native_working_set_size", - help: "Entries in the per-handle working-set cache (one per active multi-run feed session).", - registers: [metricsRegister], - collect() { - this.set(client.workingSetCacheSize); - }, - }); - - new Gauge({ - name: "realtime_native_resolve_admission_in_use", - help: "Fresh ClickHouse resolves currently holding an admission permit (live concurrency against the gate's limit).", - registers: [metricsRegister], - collect() { - this.set(client.resolveAdmissionInUse); + deliveryLagMs.record(Math.max(lagMs, 0), { path }); + emittedRows.record(rowCount); }, - }); - - new Gauge({ - name: "realtime_native_held_feeds", - help: "Long-polls currently held, by feed kind — the system's capacity unit.", - labelNames: ["kind"] as const, - registers: [metricsRegister], - collect() { + onBackstopResult: (result) => backstops.add(1, { result }), + 
onConcurrencyRejected: () => concurrencyRejections.add(1), + }); + + meter + .createObservableGauge("realtime_native.working_set_size", { + description: + "Entries in the per-handle working-set cache (one per active multi-run feed session).", + }) + .addCallback((result) => result.observe(client.workingSetCacheSize)); + + meter + .createObservableGauge("realtime_native.resolve_admission_in_use", { + description: + "Fresh ClickHouse resolves currently holding an admission permit (live concurrency against the gate's limit).", + }) + .addCallback((result) => result.observe(client.resolveAdmissionInUse)); + + meter + .createObservableGauge("realtime_native.held_feeds", { + description: "Long-polls currently held, by feed kind — the system's capacity unit.", + }) + .addCallback((result) => { const counts = router.heldFeedCounts; - this.set({ kind: "run" }, counts.run); - this.set({ kind: "tag" }, counts.tag); - this.set({ kind: "batch" }, counts.batch); - }, - }); - - new Gauge({ - name: "realtime_native_active_envs", - help: "Environments currently routed on this instance (held feeds + lingering subscriptions).", - registers: [metricsRegister], - collect() { - this.set(router.activeEnvCount); - }, - }); + result.observe(counts.run, { kind: "run" }); + result.observe(counts.tag, { kind: "tag" }); + result.observe(counts.batch, { kind: "batch" }); + }); + + meter + .createObservableGauge("realtime_native.active_envs", { + description: + "Environments currently routed on this instance (held feeds + lingering subscriptions).", + }) + .addCallback((result) => result.observe(router.activeEnvCount)); return client; } diff --git a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts index bf0cb456e23..b656052c339 100644 --- a/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts +++ b/apps/webapp/app/services/realtime/runChangeNotifierInstance.server.ts @@ -1,6 +1,5 @@ 
-import { Counter, Gauge } from "prom-client"; +import { getMeter } from "@internal/tracing"; import { env } from "~/env.server"; -import { metricsRegister } from "~/metrics.server"; import { singleton } from "~/utils/singleton"; import { RunChangeNotifier, type ChangeRecordInput } from "./runChangeNotifier.server"; @@ -16,23 +15,20 @@ function initializeRunChangeNotifier(): RunChangeNotifier { // broadcast every message to every node, so this is what actually shards load. const shardedPubSub = clusterMode && env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_SHARDED_ENABLED === "1"; - const publishes = new Counter({ - name: "realtime_run_change_notifier_publishes_total", - help: "Change-record publishes by outcome. Failures are the leading indicator that feeds are degrading to their backstops (pub/sub Redis trouble).", - labelNames: ["result"] as const, - registers: [metricsRegister], + const meter = getMeter("realtime-notifier"); + + const publishes = meter.createCounter("realtime_notifier.publishes", { + description: + "Change-record publishes by outcome. Failures are the leading indicator that feeds are degrading to their backstops (pub/sub Redis trouble).", }); - const received = new Counter({ - name: "realtime_run_change_notifier_messages_received_total", - help: "Raw channel messages received by this instance's subscriber, pre-coalesce.", - registers: [metricsRegister], + const received = meter.createCounter("realtime_notifier.messages_received", { + description: "Raw channel messages received by this instance's subscriber, pre-coalesce.", }); - const delivered = new Counter({ - name: "realtime_run_change_notifier_batches_delivered_total", - help: "Coalesced batches delivered to listeners. received/batches = the coalesce ratio (how hard a busy env is being collapsed).", - registers: [metricsRegister], + const delivered = meter.createCounter("realtime_notifier.batches_delivered", { + description: + "Coalesced batches delivered to listeners. 
received/batches = the coalesce ratio (how hard a busy env is being collapsed).", }); const notifier = new RunChangeNotifier({ @@ -48,19 +44,16 @@ function initializeRunChangeNotifier(): RunChangeNotifier { }, envWakeCoalesceWindowMs: env.REALTIME_BACKEND_NATIVE_ENV_WAKE_COALESCE_WINDOW_MS, shardedPubSub, - onPublishResult: (ok) => publishes.inc({ result: ok ? "ok" : "error" }), - onMessageReceived: () => received.inc(), - onBatchDelivered: () => delivered.inc(), + onPublishResult: (ok) => publishes.add(1, { result: ok ? "ok" : "error" }), + onMessageReceived: () => received.add(1), + onBatchDelivered: () => delivered.add(1), }); - new Gauge({ - name: "realtime_run_change_notifier_active_subscriptions", - help: "Distinct runs currently subscribed for realtime change notifications", - collect() { - this.set(notifier.activeSubscriptionCount); - }, - registers: [metricsRegister], - }); + meter + .createObservableGauge("realtime_notifier.active_subscriptions", { + description: "Distinct env channels currently subscribed for realtime change notifications.", + }) + .addCallback((result) => result.observe(notifier.activeSubscriptionCount)); return notifier; } diff --git a/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts index 041e8edd5d4..8dbb5007c20 100644 --- a/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts +++ b/apps/webapp/app/services/realtime/shadowRealtimeClientInstance.server.ts @@ -1,7 +1,6 @@ -import { Counter } from "prom-client"; +import { getMeter } from "@internal/tracing"; import { $replica } from "~/db.server"; import { env } from "~/env.server"; -import { metricsRegister } from "~/metrics.server"; import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; import { singleton } from "~/utils/singleton"; import { realtimeClient } from "../realtimeClientGlobal.server"; @@ -15,11 +14,9 @@ import { 
ShadowRealtimeClient } from "./shadowRealtimeClient.server"; * when an org's `realtimeBackend` flag is set to "shadow". */ function initializeShadowRealtimeClient(): ShadowRealtimeClient { - const compares = new Counter({ - name: "realtime_shadow_compare_total", - help: "Dual-run shadow-compare outcomes (Electric vs native). kind=serialization|membership, result=match|diverge|skew.", - labelNames: ["feed", "kind", "result"] as const, - registers: [metricsRegister], + const compares = getMeter("realtime-shadow").createCounter("realtime_shadow.compares", { + description: + "Dual-run shadow-compare outcomes (Electric vs native). kind=serialization|membership, result=match|diverge|skew.", }); const comparator = new RealtimeShadowComparator({ @@ -39,19 +36,20 @@ function initializeShadowRealtimeClient(): ShadowRealtimeClient { onOutcome: (outcome) => { const { feed } = outcome; if (outcome.serializationMatched) { - compares.inc({ feed, kind: "serialization", result: "match" }, outcome.serializationMatched); + compares.add(outcome.serializationMatched, { feed, kind: "serialization", result: "match" }); } if (outcome.serializationDiverged) { - compares.inc( - { feed, kind: "serialization", result: "diverge" }, - outcome.serializationDiverged - ); + compares.add(outcome.serializationDiverged, { + feed, + kind: "serialization", + result: "diverge", + }); } if (outcome.serializationSkew) { - compares.inc({ feed, kind: "serialization", result: "skew" }, outcome.serializationSkew); + compares.add(outcome.serializationSkew, { feed, kind: "serialization", result: "skew" }); } if (outcome.membershipMatch !== undefined) { - compares.inc({ + compares.add(1, { feed, kind: "membership", result: outcome.membershipMatch ? 
"match" : "diverge", From ff0e9ef70964dc3f0a78532034f74f70abf9d90a Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 10 Jun 2026 16:38:58 +0100 Subject: [PATCH 20/23] feat(webapp): add a local Grafana dashboard for the realtime backend metrics Provisions a Realtime Native Backend dashboard in the local docker observability stack: delivery-lag percentiles, live polls by path, the backstop-delivered alarm, gap-recovery replays and evictions, rows per emission, held feeds, and resolve health. Works out of the box with docker:full once INTERNAL_OTEL_METRIC_EXPORTER_ENABLED points the webapp at the bundled collector. Also moves the local Grafana default port from 3001 to 4001 so it stops colliding with common dev-server ports. --- .../dashboards/realtime-native.json | 503 ++++++++++++++++++ docker/docker-compose.extras.yml | 2 +- 2 files changed, 504 insertions(+), 1 deletion(-) create mode 100644 docker/config/grafana/provisioning/dashboards/realtime-native.json diff --git a/docker/config/grafana/provisioning/dashboards/realtime-native.json b/docker/config/grafana/provisioning/dashboards/realtime-native.json new file mode 100644 index 00000000000..832f2c8e320 --- /dev/null +++ b/docker/config/grafana/provisioning/dashboards/realtime-native.json @@ -0,0 +1,503 @@ +{ + "title": "Realtime Native Backend", + "uid": "realtime-native", + "tags": [ + "trigger.dev", + "realtime" + ], + "timezone": "browser", + "schemaVersion": 39, + "refresh": "10s", + "time": { + "from": "now-30m", + "to": "now" + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "annotations": { + "list": [] + }, + "templating": { + "list": [] + }, + "panels": [ + { + "id": 1, + "type": "timeseries", + "title": "Delivery lag (write \u2192 emission)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "expr": "histogram_quantile(0.5, sum(rate(triggerdotdev_realtime_native_delivery_lag_ms_milliseconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "p50" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(triggerdotdev_realtime_native_delivery_lag_ms_milliseconds_bucket[$__rate_interval])) by (le))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ms", + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "The end-to-end SLI: now minus the newest emitted row's updatedAt. A p99 approaching the ~20s backstop hold means live wakes are being missed." + }, + { + "id": 2, + "type": "timeseries", + "title": "Live polls by path", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 0 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(triggerdotdev_realtime_native_live_polls_total[$__rate_interval])) by (path)", + "legendFormat": "{{path}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "fast-hydrate = router wake, no ClickHouse. full-resolve = backstop. cold-resolve = fresh env subscription probed (instance hop / first poll)." 
+ }, + { + "id": 3, + "type": "stat", + "title": "Backstop DELIVERED (should be ~0)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 0 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(triggerdotdev_realtime_native_backstops_total{result=\"delivered\"}[5m])) or vector(0)", + "legendFormat": "delivered/s" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.01 + } + ] + } + }, + "overrides": [] + }, + "options": { + "colorMode": "background", + "graphMode": "area", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ] + } + }, + "description": "A backstop that finds missed changes means the notify/replay path is leaking. Alert on sustained non-zero." + }, + { + "id": 4, + "type": "timeseries", + "title": "Wakeups by reason", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 8 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(triggerdotdev_realtime_native_wakeups_total[$__rate_interval])) by (reason)", + "legendFormat": "{{reason}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "notify = the architecture working. A rising timeout share with active traffic = publishes not routing." 
+ }, + { + "id": 5, + "type": "timeseries", + "title": "Gap recovery: replays + evictions", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 8 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(triggerdotdev_realtime_native_replays_total[$__rate_interval])) by (result)", + "legendFormat": "replay {{result}}" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(triggerdotdev_realtime_native_replay_evictions_total[$__rate_interval])) by (reason)", + "legendFormat": "evict {{reason}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "Replays recover records that landed between a connection's polls. 'evict cap' = an env churns more runs than the buffer window holds \u2014 retune REPLAY_MAX_RUNS / WINDOW_MS." 
+ }, + { + "id": 6, + "type": "timeseries", + "title": "Rows per emission (p99)", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 8 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.99, sum(rate(triggerdotdev_realtime_native_emitted_rows_bucket[$__rate_interval])) by (le))", + "legendFormat": "p99 rows" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "Deltas should be small. A fat tail means working-set / offset-floor fallbacks are re-emitting full sets." + }, + { + "id": 7, + "type": "timeseries", + "title": "Held feeds by kind", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(triggerdotdev_realtime_native_held_feeds) by (kind)", + "legendFormat": "{{kind}}" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "Long-polls currently held \u2014 the capacity unit." 
+ }, + { + "id": 8, + "type": "timeseries", + "title": "Envs + channel subscriptions", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(triggerdotdev_realtime_native_active_envs)", + "legendFormat": "routed envs" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(triggerdotdev_realtime_notifier_active_subscriptions)", + "legendFormat": "redis channels" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "Routed envs includes lingering subscriptions (kept alive briefly after the last feed closes)." 
+ }, + { + "id": 9, + "type": "timeseries", + "title": "Resolve health", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(triggerdotdev_realtime_native_runset_resolves_total[$__rate_interval])) by (result)", + "legendFormat": "resolve {{result}}" + }, + { + "refId": "B", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(triggerdotdev_realtime_native_resolve_admission_in_use)", + "legendFormat": "gate in use" + }, + { + "refId": "C", + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(triggerdotdev_realtime_native_concurrency_rejections_total[$__rate_interval]))", + "legendFormat": "429/s" + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "lineWidth": 1, + "fillOpacity": 8, + "showPoints": "never" + } + }, + "overrides": [] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "description": "hit/coalesced vs miss = the single-flight cache collapsing same-filter herds. Gate in use near the limit = reconnect stampedes queueing." 
+ } + ] +} \ No newline at end of file diff --git a/docker/docker-compose.extras.yml b/docker/docker-compose.extras.yml index 4c74c2acf70..cf16272dcc5 100644 --- a/docker/docker-compose.extras.yml +++ b/docker/docker-compose.extras.yml @@ -113,7 +113,7 @@ services: - grafana-data:/var/lib/grafana - ./config/grafana/provisioning:/etc/grafana/provisioning:ro ports: - - "${GRAFANA_HOST_PORT:-3001}:3000" + - "${GRAFANA_HOST_PORT:-4001}:3000" environment: GF_SECURITY_ADMIN_USER: admin GF_SECURITY_ADMIN_PASSWORD: admin From 4611cb4006de2bd133ed67c26a78842e33a876af Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 10 Jun 2026 16:52:34 +0100 Subject: [PATCH 21/23] fix(webapp): wake unfiltered runs feeds on the native realtime backend Runs subscriptions with no tag filter were never indexed for change routing, so they only received updates from the periodic backstop poll instead of sub-second change notifications. Route every change record to zero-filter feeds and apply the same rule in the replay path. --- .../realtime/envChangeRouter.server.ts | 15 +++++++++--- .../test/realtime/envChangeRouter.test.ts | 24 +++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/apps/webapp/app/services/realtime/envChangeRouter.server.ts b/apps/webapp/app/services/realtime/envChangeRouter.server.ts index 54ec9e3fea8..587f0e55c5a 100644 --- a/apps/webapp/app/services/realtime/envChangeRouter.server.ts +++ b/apps/webapp/app/services/realtime/envChangeRouter.server.ts @@ -88,6 +88,8 @@ type EnvState = { byBatchId: Map>; /** All tag feeds, for routing partial records (no tags) as hydrate-to-classify candidates. */ tagFeeds: Set; + /** Tag feeds with no tag filter — they match every record but are unreachable via byTag. */ + unfilteredTagFeeds: Set; /** When this env's channel subscription started (for the gap-coverage check). */ subscribedAtMs: number; /** Latest record per run, insertion-ordered, for replaying inter-poll gaps to newly-armed feeds. 
*/ @@ -241,6 +243,7 @@ export class EnvChangeRouter { byTag: new Map(), byBatchId: new Map(), tagFeeds: new Set(), + unfilteredTagFeeds: new Set(), subscribedAtMs: Date.now(), recent: new Map(), }; @@ -309,11 +312,11 @@ export class EnvChangeRouter { case "batch": return record.batchId != null && record.batchId === feed.filter.batchId; case "tag": { - // Partial record (no tags) = hydrate-to-classify candidate, like the live path. - if (record.tags === undefined) { + const tags = feed.filter.tags; + // Unfiltered feed matches everything; partial record (no tags) = hydrate-to-classify. + if (tags.length === 0 || record.tags === undefined) { return true; } - const tags = feed.filter.tags; return record.tags.some((tag) => tags.includes(tag)); } } @@ -368,6 +371,9 @@ export class EnvChangeRouter { break; case "tag": env.tagFeeds.add(feed); + if (feed.filter.tags.length === 0) { + env.unfilteredTagFeeds.add(feed); + } for (const tag of feed.filter.tags) { addToIndex(env.byTag, tag, feed); } @@ -385,6 +391,7 @@ export class EnvChangeRouter { break; case "tag": env.tagFeeds.delete(feed); + env.unfilteredTagFeeds.delete(feed); for (const tag of feed.filter.tags) { removeFromIndex(env.byTag, tag, feed); } @@ -436,6 +443,8 @@ export class EnvChangeRouter { addMatch(feed, record.runId); } } + // Unfiltered tag feeds match every record but live outside the index. + for (const feed of env.unfilteredTagFeeds) addMatch(feed, record.runId); } else { // Partial record (no membership data): route to every tag feed as a candidate to // hydrate-and-classify (rare; the publish side emits full records in practice). 
diff --git a/apps/webapp/test/realtime/envChangeRouter.test.ts b/apps/webapp/test/realtime/envChangeRouter.test.ts index b9688d98f55..6f2eb6df980 100644 --- a/apps/webapp/test/realtime/envChangeRouter.test.ts +++ b/apps/webapp/test/realtime/envChangeRouter.test.ts @@ -86,6 +86,30 @@ describe("EnvChangeRouter", () => { reg.close(); }); + it("wakes an unfiltered tag feed (no tags) for every full record, live and via replay", async () => { + const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); + const { router, src } = makeRouter(rows); + + // Live path: a full record (tags defined) must reach the zero-filter feed even + // though it can never appear in the byTag index. + const reg = router.register("env_1", { kind: "tag", tags: [] }, []); + const wait = reg.waitForMatch(undefined, 1_000); + src.push("env_1", [record("r1", { tags: ["a"] })]); + const live = await wait; + expect(live.reason).toBe("notify"); + expect(live.rows.map((m) => m.row.id)).toEqual(["r1"]); + reg.close(); + + // Replay path: the buffered record matches an unfiltered feed registered after the push. + const late = router.register("env_1", { kind: "tag", tags: [] }, [], { + replaySinceMs: Date.now() - 1_000, + }); + const replayed = await late.waitForMatch(undefined, 1_000); + expect(replayed.reason).toBe("notify"); + expect(replayed.rows.map((m) => m.row.id)).toEqual(["r1"]); + late.close(); + }); + it("batch-hydrates ONCE and shares the serialized value across feeds matching the same run", async () => { const rows = new Map([["r1", row("r1", { tags: ["a"] })]]); const { router, src, hydrateSpy } = makeRouter(rows); From c7e7b0a15228ddbdcf02347dd7e498e8b2f50b35 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 10 Jun 2026 20:12:11 +0100 Subject: [PATCH 22/23] chore(webapp): raise the native realtime wake coalesce default to 250ms Halves wake and response volume on busy environments in exchange for a worst-case quarter second of added delivery delay. 
Tunable via REALTIME_BACKEND_NATIVE_ENV_WAKE_COALESCE_WINDOW_MS. --- apps/webapp/app/env.server.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 41431ba47f1..eb7474683c2 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -317,7 +317,7 @@ const EnvironmentSchema = z // Bucket (ms) the tag-list createdAt floor is quantized to so same-tag feeds share a cache entry; 0 disables. REALTIME_BACKEND_NATIVE_RUNSET_CREATED_AT_BUCKET_MS: z.coerce.number().int().default(60_000), // Leading-edge throttle (ms) on per-env wake delivery; 0 wakes on every change. - REALTIME_BACKEND_NATIVE_ENV_WAKE_COALESCE_WINDOW_MS: z.coerce.number().int().default(100), + REALTIME_BACKEND_NATIVE_ENV_WAKE_COALESCE_WINDOW_MS: z.coerce.number().int().default(250), // "1" holds a multi-run live poll open on a non-matching wake instead of replying up-to-date. REALTIME_BACKEND_NATIVE_HOLD_ON_EMPTY: z.string().default("1"), // Max concurrent fresh ClickHouse resolves per instance (reconnect-stampede gate); 0 disables. From 44d852b907349e26f6d1fa8cfc470c2f7554c507 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Wed, 10 Jun 2026 21:00:19 +0100 Subject: [PATCH 23/23] feat(webapp): share realtime replay cursors across instances A load balancer hop previously made a connection's inter-poll gap unprovable, forcing a cold resolve and a full-window replay on the new instance. Per-connection replay cursors (one timestamp each) now live in Redis behind REALTIME_BACKEND_NATIVE_SHARED_REPLAY_CURSORS (default on), so any instance can read the true gap. Store reads have a bounded deadline and degrade to the old cold-probe behavior on any Redis trouble. 
--- apps/webapp/app/env.server.ts | 2 + .../realtime/nativeRealtimeClient.server.ts | 30 ++-- .../nativeRealtimeClientInstance.server.ts | 25 +++ .../realtime/replayCursorStore.server.ts | 145 ++++++++++++++++++ .../test/realtime/replayCursorStore.test.ts | 141 +++++++++++++++++ 5 files changed, 333 insertions(+), 10 deletions(-) create mode 100644 apps/webapp/app/services/realtime/replayCursorStore.server.ts create mode 100644 apps/webapp/test/realtime/replayCursorStore.test.ts diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index eb7474683c2..38cbf6e07db 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -318,6 +318,8 @@ const EnvironmentSchema = z REALTIME_BACKEND_NATIVE_RUNSET_CREATED_AT_BUCKET_MS: z.coerce.number().int().default(60_000), // Leading-edge throttle (ms) on per-env wake delivery; 0 wakes on every change. REALTIME_BACKEND_NATIVE_ENV_WAKE_COALESCE_WINDOW_MS: z.coerce.number().int().default(250), + // "1" shares per-connection replay cursors fleet-wide via Redis, so a load-balancer hop reads the connection's true inter-poll gap instead of cold-resolving. + REALTIME_BACKEND_NATIVE_SHARED_REPLAY_CURSORS: z.string().default("1"), // "1" holds a multi-run live poll open on a non-matching wake instead of replying up-to-date. REALTIME_BACKEND_NATIVE_HOLD_ON_EMPTY: z.string().default("1"), // Max concurrent fresh ClickHouse resolves per instance (reconnect-stampede gate); 0 disables. 
diff --git a/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts b/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts index bd693ec2788..00e50ed9fcc 100644 --- a/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts +++ b/apps/webapp/app/services/realtime/nativeRealtimeClient.server.ts @@ -33,6 +33,7 @@ import { } from "./envChangeRouter.server"; import { type RunHydrator, type RunListResolver } from "./runReader.server"; import { type RealtimeConcurrencyLimiter } from "./realtimeConcurrencyLimiter.server"; +import { InMemoryReplayCursorStore, type ReplayCursorStore } from "./replayCursorStore.server"; /** Widened with projectId so the tag-list feed can resolve ids via ClickHouse (needs org + project + env). */ export type RealtimeListEnvironment = RealtimeEnvironment & { projectId: string }; @@ -106,6 +107,10 @@ export type NativeRealtimeClientOptions = { holdOnEmpty?: boolean; /** Max concurrent fresh ClickHouse resolves (cache misses) per instance, bounding a distinct-filter stampede. Defaults to 16; 0 disables. */ resolveAdmissionLimit?: number; + /** Per-connection replay-cursor store. Inject a fleet-shared (Redis) store so an instance + * hop reads the connection's true inter-poll gap instead of cold-probing; defaults to a + * per-instance in-memory cache. */ + replayCursorStore?: ReplayCursorStore; /** Observability hook: why a live request woke (notify vs timeout vs abort). */ onWakeup?: (reason: WakeupReason) => void; /** Observability hook: how a live poll resolved (fast path vs full resolve). */ @@ -206,18 +211,21 @@ export class NativeRealtimeClient implements RealtimeStreamClient { /** Bounds concurrent fresh CH resolves (undefined => unbounded). */ readonly #admissionGate?: ResolveAdmissionGate; /** Per-connection: when this connection's last response was sent, so the router's - * replay covers exactly the inter-poll gap instead of rewinding a full window. 
*/ - readonly #replayCursorCache: BoundedTtlCache; + * replay covers exactly the inter-poll gap instead of rewinding a full window. + * Fleet-shared when a store is injected (hops stop looking like unknown gaps). */ + readonly #replayCursors: ReplayCursorStore; constructor(private readonly options: NativeRealtimeClientOptions) { this.#workingSetCache = new BoundedTtlCache( options.workingSetCacheTtlMs ?? LIST_CACHE_TTL_MS, options.listCacheMaxEntries ?? LIST_CACHE_MAX_ENTRIES ); - this.#replayCursorCache = new BoundedTtlCache( - options.workingSetCacheTtlMs ?? LIST_CACHE_TTL_MS, - options.listCacheMaxEntries ?? LIST_CACHE_MAX_ENTRIES - ); + this.#replayCursors = + options.replayCursorStore ?? + new InMemoryReplayCursorStore( + options.workingSetCacheTtlMs ?? LIST_CACHE_TTL_MS, + options.listCacheMaxEntries ?? LIST_CACHE_MAX_ENTRIES + ); this.#runSetCache = new BoundedTtlCache( options.runSetResolveCacheTtlMs ?? DEFAULT_RUNSET_CACHE_TTL_MS, options.runSetResolveCacheMaxEntries ?? DEFAULT_RUNSET_CACHE_MAX_ENTRIES @@ -528,7 +536,7 @@ export class NativeRealtimeClient implements RealtimeStreamClient { maxUpdatedAt = Math.max(maxUpdatedAt, updatedAtMs); } this.#workingSetCache.set(this.#workingSetKey(environment.id, handle), seen); - this.#replayCursorCache.set(this.#workingSetKey(environment.id, handle), Date.now()); + this.#replayCursors.set(this.#workingSetKey(environment.id, handle), Date.now()); return this.#buildResponse(buildRowsBody(changes, skipColumns), apiVersion, clientVersion, { offset: encodeOffset(maxUpdatedAt, this.#nextSeq()), @@ -559,7 +567,7 @@ export class NativeRealtimeClient implements RealtimeStreamClient { const workingSetKey = this.#workingSetKey(environment.id, handle); let prevSeen = this.#workingSetCache.get(workingSetKey); - const markPollEnd = () => this.#replayCursorCache.set(workingSetKey, Date.now()); + const markPollEnd = () => this.#replayCursors.set(workingSetKey, Date.now()); const emitFromSerialized = (changes: 
SerializedRowChange[], maxUpdatedAt: number): Response => { const seq = this.#nextSeq(); markPollEnd(); @@ -588,12 +596,14 @@ export class NativeRealtimeClient implements RealtimeStreamClient { }); }; + // When this connection last received data, so replay covers exactly its gap. A store + // error degrades to undefined (cold probe), never a failed poll. + const replaySinceMs = await this.#replayCursors.get(workingSetKey); const registration = this.options.router.register( environment.id, this.#feedFilter(filter), skipColumns, - // When this connection last received data, so replay covers exactly its gap. - { replaySinceMs: this.#replayCursorCache.get(workingSetKey) } + { replaySinceMs } ); // Cold start (fresh env subscription, e.g. an instance hop): resolve once up front diff --git a/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts b/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts index 948d791f850..c41149f0cc6 100644 --- a/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts +++ b/apps/webapp/app/services/realtime/nativeRealtimeClientInstance.server.ts @@ -9,6 +9,7 @@ import { EnvChangeRouter } from "./envChangeRouter.server"; import { NativeRealtimeClient } from "./nativeRealtimeClient.server"; import { RealtimeConcurrencyLimiter } from "./realtimeConcurrencyLimiter.server"; import { getRunChangeNotifier } from "./runChangeNotifierInstance.server"; +import { RedisReplayCursorStore } from "./replayCursorStore.server"; import { RunHydrator } from "./runReader.server"; // Process-singleton wiring for the native realtime client; only constructed when a @@ -77,6 +78,11 @@ function initializeNativeRealtimeClient(): NativeRealtimeClient { description: "Polls rejected (429) by the per-env concurrency limiter.", }); + const replayCursorOps = meter.createCounter("realtime_native.replay_cursor_ops", { + description: + "Shared replay-cursor store operations by outcome. 
Errors degrade hops to cold resolves (watch live_polls{path='cold-resolve'} rise with them), never failed polls.", + }); + const limiter = new RealtimeConcurrencyLimiter({ keyPrefix: "tr:realtime:native:concurrency", redis: { @@ -89,6 +95,24 @@ function initializeNativeRealtimeClient(): NativeRealtimeClient { }, }); + // Fleet-shared replay cursors (one timestamp per connection) on the same Redis as the + // change channel, so a load-balancer hop reads the connection's true inter-poll gap. + const replayCursorStore = + env.REALTIME_BACKEND_NATIVE_SHARED_REPLAY_CURSORS === "1" + ? new RedisReplayCursorStore({ + redis: { + host: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_HOST, + port: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_PORT, + username: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_USERNAME, + password: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_PASSWORD, + tlsDisabled: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_TLS_DISABLED === "true", + clusterMode: env.REALTIME_BACKEND_NATIVE_PUBSUB_REDIS_CLUSTER_MODE_ENABLED === "1", + }, + ttlMs: env.REALTIME_BACKEND_NATIVE_WORKING_SET_TTL_MS, + onResult: (op, ok) => replayCursorOps.add(1, { op, result: ok ? "ok" : "error" }), + }) + : undefined; + // One RunHydrator shared by the router and the client, so its single-flight + short-TTL cache covers both. 
const runReader = new RunHydrator({ replica: $replica, @@ -138,6 +162,7 @@ function initializeNativeRealtimeClient(): NativeRealtimeClient { runSetCreatedAtBucketMs: env.REALTIME_BACKEND_NATIVE_RUNSET_CREATED_AT_BUCKET_MS, holdOnEmpty: env.REALTIME_BACKEND_NATIVE_HOLD_ON_EMPTY === "1", resolveAdmissionLimit: env.REALTIME_BACKEND_NATIVE_RESOLVE_ADMISSION_LIMIT, + replayCursorStore, onWakeup: (reason) => wakeups.add(1, { reason }), onLivePollPath: (path) => livePollPaths.add(1, { path }), onRunSetResolve: (result) => runSetResolves.add(1, { result }), diff --git a/apps/webapp/app/services/realtime/replayCursorStore.server.ts b/apps/webapp/app/services/realtime/replayCursorStore.server.ts new file mode 100644 index 00000000000..597957704af --- /dev/null +++ b/apps/webapp/app/services/realtime/replayCursorStore.server.ts @@ -0,0 +1,145 @@ +import { createRedisClient, type RedisClient, type RedisWithClusterOptions } from "~/redis.server"; +import { logger } from "../logger.server"; +import { BoundedTtlCache } from "./boundedTtlCache"; + +/** + * Per-connection replay cursors ("when did this connection last receive data"), keyed by the + * env-prefixed working-set key. Sharing them fleet-wide makes an instance hop look like a normal + * inter-poll gap instead of an unknown one, so hops stop triggering cold resolves and full-window + * replays. Values are single timestamps, so the shared store stays cheap. + */ +export interface ReplayCursorStore { + /** The connection's last-response timestamp; undefined on miss OR error (the caller + * degrades to a cold probe / full-window replay, never blocks the poll). */ + get(key: string): Promise; + /** Fire-and-forget stamp; must never throw. */ + set(key: string, ms: number): void; +} + +/** Per-instance fallback with the same shape (used when the shared store is disabled, and in tests). 
*/ +export class InMemoryReplayCursorStore implements ReplayCursorStore { + readonly #cache: BoundedTtlCache; + + constructor(ttlMs: number, maxEntries: number) { + this.#cache = new BoundedTtlCache(ttlMs, maxEntries); + } + + async get(key: string): Promise { + return this.#cache.get(key); + } + + set(key: string, ms: number): void { + this.#cache.set(key, ms); + } +} + +export type RedisReplayCursorStoreOptions = { + redis: RedisWithClusterOptions; + /** Entry TTL (ms); matches the working-set TTL so both views of a connection age out together. */ + ttlMs: number; + /** Read deadline (ms): a slow or down Redis degrades the poll to a cold probe instead of stalling it. */ + getTimeoutMs?: number; + keyPrefix?: string; + connectionName?: string; + /** Observability hook: a store op settled (errors are the degradation signal, not failures). */ + onResult?: (op: "get" | "set", ok: boolean) => void; +}; + +const DEFAULT_KEY_PREFIX = "realtime:replay-cursor:"; +const DEFAULT_GET_TIMEOUT_MS = 250; +const TIMED_OUT = Symbol("replay-cursor-get-timeout"); + +export class RedisReplayCursorStore implements ReplayCursorStore { + #client: RedisClient | undefined; + + constructor(private readonly options: RedisReplayCursorStoreOptions) {} + + async get(key: string): Promise { + try { + const raw = await this.#getWithDeadline(this.#key(key)); + if (raw === TIMED_OUT) { + this.options.onResult?.("get", false); + logger.warn("[replayCursorStore] replay-cursor read timed out", { key }); + return undefined; + } + this.options.onResult?.("get", true); + if (raw === null) { + return undefined; + } + const ms = Number(raw); + return Number.isFinite(ms) && ms > 0 ? 
ms : undefined; + } catch (error) { + this.options.onResult?.("get", false); + logger.error("[replayCursorStore] failed to read a replay cursor", { error, key }); + return undefined; + } + } + + /** GET raced against the read deadline (ioredis queues commands while disconnected, which + * would otherwise stall every poll start through an outage). */ + #getWithDeadline(key: string): Promise { + return new Promise((resolve, reject) => { + const timer = setTimeout( + () => resolve(TIMED_OUT), + this.options.getTimeoutMs ?? DEFAULT_GET_TIMEOUT_MS + ); + timer.unref?.(); + this.#ensureClient() + .get(key) + .then( + (value) => { + clearTimeout(timer); + resolve(value); + }, + (error) => { + clearTimeout(timer); + reject(error); + } + ); + }); + } + + set(key: string, ms: number): void { + try { + this.#ensureClient() + .set(this.#key(key), String(ms), "PX", this.options.ttlMs) + .then( + () => this.options.onResult?.("set", true), + (error) => { + this.options.onResult?.("set", false); + logger.error("[replayCursorStore] failed to write a replay cursor", { error, key }); + } + ); + } catch (error) { + this.options.onResult?.("set", false); + logger.error("[replayCursorStore] failed to write a replay cursor", { error, key }); + } + } + + async quit(): Promise { + const client = this.#client; + this.#client = undefined; + if (!client) return; + try { + // Bounded graceful QUIT; cursor writes are best-effort, so force-close beyond it. + await Promise.race([client.quit(), new Promise((resolve) => setTimeout(resolve, 500))]); + } catch { + // force-close below + } + client.disconnect(); + } + + #key(key: string): string { + return `${this.options.keyPrefix ?? DEFAULT_KEY_PREFIX}${key}`; + } + + #ensureClient(): RedisClient { + if (!this.#client) { + this.#client = createRedisClient( + this.options.connectionName ?? 
"trigger:realtime:replay-cursors", + this.options.redis + ); + } + return this.#client; + } +} diff --git a/apps/webapp/test/realtime/replayCursorStore.test.ts b/apps/webapp/test/realtime/replayCursorStore.test.ts new file mode 100644 index 00000000000..b66bc72df9c --- /dev/null +++ b/apps/webapp/test/realtime/replayCursorStore.test.ts @@ -0,0 +1,141 @@ +import { redisTest } from "@internal/testcontainers"; +import { setTimeout as sleep } from "node:timers/promises"; +import { CURRENT_API_VERSION } from "~/api/versions"; +import { EnvChangeRouter } from "~/services/realtime/envChangeRouter.server"; +import { + NativeRealtimeClient, + type RealtimeListEnvironment, +} from "~/services/realtime/nativeRealtimeClient.server"; +import { + InMemoryReplayCursorStore, + RedisReplayCursorStore, + type ReplayCursorStore, +} from "~/services/realtime/replayCursorStore.server"; +import { describe, expect, it, vi } from "vitest"; + +describe("InMemoryReplayCursorStore", () => { + it("round-trips and expires", async () => { + const store = new InMemoryReplayCursorStore(50, 10); + store.set("env_1:h1", 123_456); + expect(await store.get("env_1:h1")).toBe(123_456); + expect(await store.get("env_1:other")).toBeUndefined(); + await sleep(60); + expect(await store.get("env_1:h1")).toBeUndefined(); + }); +}); + +describe("RedisReplayCursorStore", () => { + redisTest("round-trips, misses, and expires via PX", async ({ redisOptions }) => { + const store = new RedisReplayCursorStore({ + redis: { ...redisOptions, tlsDisabled: true }, + ttlMs: 150, + }); + try { + const now = Date.now(); + store.set("env_1:h1", now); + await vi.waitFor(async () => expect(await store.get("env_1:h1")).toBe(now)); + expect(await store.get("env_1:missing")).toBeUndefined(); + await sleep(200); + expect(await store.get("env_1:h1")).toBeUndefined(); + } finally { + await store.quit(); + } + }); + + redisTest("a second store instance reads the first's cursor (fleet sharing)", async ({ + redisOptions, + }) => { + 
const a = new RedisReplayCursorStore({ + redis: { ...redisOptions, tlsDisabled: true }, + ttlMs: 60_000, + }); + const b = new RedisReplayCursorStore({ + redis: { ...redisOptions, tlsDisabled: true }, + ttlMs: 60_000, + }); + try { + a.set("env_1:h2", 42_000); + await vi.waitFor(async () => expect(await b.get("env_1:h2")).toBe(42_000)); + } finally { + await Promise.all([a.quit(), b.quit()]); + } + }); + + it("degrades to undefined within the read deadline when Redis is unreachable", async () => { + const results: Array<[string, boolean]> = []; + const store = new RedisReplayCursorStore({ + redis: { host: "127.0.0.1", port: 1, tlsDisabled: true } as any, + ttlMs: 1_000, + getTimeoutMs: 100, + onResult: (op, ok) => results.push([op, ok]), + }); + try { + expect(await store.get("env_1:h3")).toBeUndefined(); + expect(results).toContainEqual(["get", false]); + } finally { + await store.quit().catch(() => {}); + } + }); +}); + +describe("NativeRealtimeClient replay-cursor threading", () => { + const ENV: RealtimeListEnvironment = { id: "env_1", organizationId: "org_1", projectId: "proj_1" }; + const FLOOR_MS = Date.UTC(2026, 5, 7, 12, 0, 0); + + it("passes the stored cursor to register and stamps the store after responding", async () => { + const cursorMs = Date.now() - 500; + const gets: string[] = []; + const sets: Array<[string, number]> = []; + const store: ReplayCursorStore = { + get: async (key) => { + gets.push(key); + return cursorMs; + }, + set: (key, ms) => { + sets.push([key, ms]); + }, + }; + + const router = new EnvChangeRouter({ + source: { subscribeToEnv: () => () => {} }, + hydrator: { hydrateByIds: async () => [] }, + replayWindowMs: 0, + unsubscribeLingerMs: 0, + }); + const registerSpy = vi.spyOn(router, "register"); + + const client = new NativeRealtimeClient({ + runReader: { getRunById: async () => null, hydrateByIds: async () => [] } as any, + runListResolver: { resolveMatchingRunIds: async () => [] } as any, + router, + limiter: { 
incrementAndCheck: async () => true, decrement: async () => {} } as any, + cachedLimitProvider: { getCachedLimit: async () => 100 }, + maximumCreatedAtFilterAgeMs: 100 * 365 * 24 * 60 * 60 * 1000, + runSetResolveCacheTtlMs: 0, + livePollTimeoutMs: 30, + replayCursorStore: store, + }); + + const res = await client.streamRuns( + `http://localhost:3030/realtime/v1/runs?offset=${FLOOR_MS}_1&live=true&handle=runs_${FLOOR_MS}_7`, + ENV, + { tags: ["t"] }, + CURRENT_API_VERSION, + undefined, + "1.0.0" + ); + + expect(res.status).toBe(200); + expect(gets).toEqual([`env_1:runs_${FLOOR_MS}_7`]); + expect(registerSpy).toHaveBeenCalledWith( + "env_1", + expect.objectContaining({ kind: "tag" }), + expect.anything(), + { replaySinceMs: cursorMs } + ); + // The backstop's up-to-date response stamps the cursor for the next poll. + expect(sets.length).toBe(1); + expect(sets[0][0]).toBe(`env_1:runs_${FLOOR_MS}_7`); + expect(sets[0][1]).toBeGreaterThanOrEqual(cursorMs); + }); +});