#!/usr/bin/env -S npx tsx

/**
 * Script to clean orphaned concurrency entries from Redis.
 *
 * These are run IDs that exist in currentConcurrency/currentDequeued sets
 * even though the corresponding TaskRun is not in DEQUEUED, EXECUTING, or
 * WAITING_TO_RESUME status.
 *
 * Usage:
 *   npx tsx scripts/clean-orphaned-concurrency.ts \
 *     --org <orgId> \
 *     --project <projectId> \
 *     --env <envId> \
 *     --read-redis <redisUrl> \
 *     --write-redis <redisUrl> \
 *     --pg <postgresUrl> \
 *     [--dry-run] \
 *     [--include-env] \
 *     [--queues queue1,queue2]
 *
 * Options:
 *   --include-env: Also clean the environment-level concurrency sets
 *                  (not just queue-level sets)
 */
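// Key layout assumed by this script (derived from the prefixes built in main()):
//   engine:runqueue:{org:<orgId>}:proj:<projectId>:env:<envId>:currentConcurrency
//   engine:runqueue:{org:<orgId>}:proj:<projectId>:env:<envId>:queue:<queueName>:currentConcurrency
//   ...plus the matching :currentDequeued sets.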

import { parseArgs } from "node:util";
import Redis from "ioredis";
import pg from "pg";

// Statuses where a run SHOULD be in the concurrency set.
// WAITING_TO_RESUME is included because suspended runs may legitimately be in the set.
const VALID_CONCURRENCY_STATUSES = ["DEQUEUED", "EXECUTING", "WAITING_TO_RESUME"];

interface OrphanedEntry {
  key: string;
  runId: string;
  dbStatus: string | null;
  keyType: "currentConcurrency" | "currentDequeued";
  queueName: string;
}

async function main() {
  const { values } = parseArgs({
    options: {
      org: { type: "string" },
      project: { type: "string" },
      env: { type: "string" },
      "read-redis": { type: "string" },
      "write-redis": { type: "string" },
      pg: { type: "string" },
      "dry-run": { type: "boolean", default: false },
      "include-env": { type: "boolean", default: false }, // Also clean env-level concurrency sets
      queues: { type: "string" }, // Comma-separated queue names, to skip the key scan
    },
    allowPositionals: true, // Ignore extra positional args from shell escaping
  });

  const orgId = values.org;
  const projectId = values.project;
  const envId = values.env;
  const readRedisUrl = values["read-redis"];
  const writeRedisUrl = values["write-redis"];
  const pgUrl = values.pg;
  const dryRun = values["dry-run"] ?? false;
  const includeEnv = values["include-env"] ?? false;
  const queueNames = values.queues?.split(",").map((q) => q.trim()) ?? [];
  if (!orgId || !projectId || !envId || !readRedisUrl || !writeRedisUrl || !pgUrl) {
    console.error("Missing required arguments");
    console.error(
      "Usage: npx tsx scripts/clean-orphaned-concurrency.ts --org <orgId> --project <projectId> --env <envId> --read-redis <url> --write-redis <url> --pg <url> [--dry-run] [--include-env] [--queues queue1,queue2]"
    );
    process.exit(1);
  }

  console.log(`Mode: ${dryRun ? "DRY RUN" : "LIVE"}`);
  console.log(`Org: ${orgId}`);
  console.log(`Project: ${projectId}`);
  console.log(`Environment: ${envId}`);
  console.log("");

  // Connect to Redis (read).
  // NOTE: TLS certificate verification is disabled (rejectUnauthorized: false),
  // which assumes you trust the endpoints; tighten this for anything long-lived.
  const readRedis = new Redis(readRedisUrl, {
    lazyConnect: true,
    tls: { rejectUnauthorized: false },
  });
  await readRedis.connect();
  console.log("Connected to read Redis");

  // Connect to Redis (write) - only if not a dry run
  let writeRedis: Redis | null = null;
  if (!dryRun) {
    writeRedis = new Redis(writeRedisUrl, {
      lazyConnect: true,
      tls: { rejectUnauthorized: false },
    });
    await writeRedis.connect();
    console.log("Connected to write Redis");
  }

  // Connect to PostgreSQL
  const pgClient = new pg.Client({ connectionString: pgUrl });
  await pgClient.connect();
  console.log("Connected to PostgreSQL");
  console.log("");

  try {
    const keys: string[] = [];
    const envKeyPrefix = `engine:runqueue:{org:${orgId}}:proj:${projectId}:env:${envId}`;
    const queueKeyPrefix = `${envKeyPrefix}:queue:`;
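    // The {org:<orgId>} segment appears to be a Redis Cluster hash tag: every
    // key for the org hashes to the same slot, keeping multi-key operations valid.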

    if (queueNames.length > 0) {
      // Construct keys directly from the provided queue names
      console.log(`Using provided queue names: ${queueNames.join(", ")}`);
      for (const queueName of queueNames) {
        keys.push(`${queueKeyPrefix}${queueName}:currentConcurrency`);
        keys.push(`${queueKeyPrefix}${queueName}:currentDequeued`);
      }
      console.log(`Constructed ${keys.length} queue-level keys to check`);
    }

    if (includeEnv) {
      // Add environment-level concurrency keys
      keys.push(`${envKeyPrefix}:currentConcurrency`);
      keys.push(`${envKeyPrefix}:currentDequeued`);
      console.log(`Also checking environment-level concurrency keys`);
    }

    if (keys.length === 0) {
      console.error("ERROR: either the --queues flag or the --include-env flag is required.");
      console.error("");
      console.error("To find queue names, run:");
      console.error(
        `  redis-cli --tls -u "<read-redis-url>" KEYS 'engine:runqueue:{org:${orgId}}:proj:${projectId}:env:${envId}:queue:*:currentConcurrency'`
      );
      console.error("");
      console.error("Then extract the queue names and pass them with --queues 'queue1,queue2,...'");
      console.error("");
      console.error("Or use --include-env to clean just the environment-level concurrency sets.");
      process.exit(1);
    }
    console.log("");

    // Filter to only currentConcurrency and currentDequeued keys
    const concurrencyKeys = keys.filter(
      (k) => k.endsWith(":currentConcurrency") || k.endsWith(":currentDequeued")
    );

    console.log(`Processing ${concurrencyKeys.length} concurrency/dequeued keys`);
    console.log("");

    // Collect all run IDs from all keys
    const runIdToKeys = new Map<string, string[]>();

    for (const key of concurrencyKeys) {
      const members = await readRedis.smembers(key);
      for (const runId of members) {
        const existing = runIdToKeys.get(runId) || [];
        existing.push(key);
        runIdToKeys.set(runId, existing);
      }
    }

    const allRunIds = Array.from(runIdToKeys.keys());
    console.log(`Found ${allRunIds.length} unique run IDs across all concurrency sets`);
    console.log("");

    if (allRunIds.length === 0) {
      console.log("No run IDs found in concurrency sets. Nothing to clean.");
      return;
    }

    // Query the database for run statuses in batches
    const batchSize = 500;
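    // (Batches of 500 keep each IN-list query far below Postgres's 65,535
    // bind-parameter limit.)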
    const runStatuses = new Map<string, string | null>();

    for (let i = 0; i < allRunIds.length; i += batchSize) {
      const batch = allRunIds.slice(i, i + batchSize);
      const placeholders = batch.map((_, idx) => `$${idx + 1}`).join(", ");
      const query = `SELECT id, status FROM "TaskRun" WHERE id IN (${placeholders})`;

      const result = await pgClient.query(query, batch);

      for (const row of result.rows) {
        runStatuses.set(row.id, row.status);
      }

      // Mark runs missing from the database as null
      for (const runId of batch) {
        if (!runStatuses.has(runId)) {
          runStatuses.set(runId, null);
        }
      }
    }

    console.log(`Retrieved statuses for ${runStatuses.size} runs`);
    console.log("");

    // Find orphaned entries
    const orphanedEntries: OrphanedEntry[] = [];

    for (const [runId, memberKeys] of runIdToKeys) {
      const status = runStatuses.get(runId) ?? null;

      if (status === null || !VALID_CONCURRENCY_STATUSES.includes(status)) {
        for (const key of memberKeys) {
          const keyType = key.endsWith(":currentConcurrency")
            ? "currentConcurrency"
            : "currentDequeued";
          const queueMatch = key.match(/:queue:([^:]+):/);
          // Use "[ENV]" for environment-level keys (no :queue: segment)
          const queueName = queueMatch ? queueMatch[1] : "[ENV]";

          orphanedEntries.push({
            key,
            runId,
            dbStatus: status,
            keyType,
            queueName,
          });
        }
      }
    }

    if (orphanedEntries.length === 0) {
      console.log("No orphaned entries found. All concurrency sets are clean.");
      return;
    }

    // Group by queue for reporting
    const byQueue = new Map<string, OrphanedEntry[]>();
    for (const entry of orphanedEntries) {
      const existing = byQueue.get(entry.queueName) || [];
      existing.push(entry);
      byQueue.set(entry.queueName, existing);
    }

    console.log(`Found ${orphanedEntries.length} orphaned entries across ${byQueue.size} queues:`);
    console.log("");

    for (const [queueName, entries] of byQueue) {
      console.log(`Queue: ${queueName}`);

      const concurrencyEntries = entries.filter((e) => e.keyType === "currentConcurrency");
      const dequeuedEntries = entries.filter((e) => e.keyType === "currentDequeued");

      if (concurrencyEntries.length > 0) {
        console.log(`  currentConcurrency (${concurrencyEntries.length}):`);
        for (const entry of concurrencyEntries) {
          console.log(`    - ${entry.runId} (DB status: ${entry.dbStatus ?? "NOT FOUND"})`);
        }
      }

      if (dequeuedEntries.length > 0) {
        console.log(`  currentDequeued (${dequeuedEntries.length}):`);
        for (const entry of dequeuedEntries) {
          console.log(`    - ${entry.runId} (DB status: ${entry.dbStatus ?? "NOT FOUND"})`);
        }
      }

      console.log("");
    }

    if (dryRun) {
      console.log("DRY RUN: No changes made. Run without --dry-run to clean these entries.");
      return;
    }

    // Clean orphaned entries
    console.log("Cleaning orphaned entries...");
    console.log("");

    // Group by key for efficient SREM
    const removalsByKey = new Map<string, string[]>();
    for (const entry of orphanedEntries) {
      const existing = removalsByKey.get(entry.key) || [];
      existing.push(entry.runId);
      removalsByKey.set(entry.key, existing);
    }

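    // ioredis flattens array arguments, so srem(key, runIds) removes every
    // member of the array in a single command.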
    let totalRemoved = 0;
    for (const [key, runIds] of removalsByKey) {
      const removed = await writeRedis!.srem(key, runIds);
      console.log(`Removed ${removed} entries from ${key}`);
      totalRemoved += removed;
    }

    console.log("");
    console.log(`Total removed: ${totalRemoved} entries`);
  } finally {
    await readRedis.quit();
    if (writeRedis) {
      await writeRedis.quit();
    }
    await pgClient.end();
  }
}

main().catch((err) => {
  console.error("Fatal error:", err);
  process.exit(1);
});