From a45b8af251dfd1eb6c7fcc4bacee84cd23424756 Mon Sep 17 00:00:00 2001
From: James <james.russo@heygen.com>
Date: Tue, 16 Jun 2026 01:14:03 +0000
Subject: [PATCH 1/2] feat(cli): support OpenRouter as an alternative vision
 provider for capture captioning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`hyperframes capture` could only enrich asset descriptions with Gemini vision,
which requires a Google API key. Add OpenRouter as an alternative so users
without Google access can caption via any vision-capable model through one
unified key.

Provider is selected by which key is present: OPENROUTER_API_KEY → OpenRouter
(OpenAI-style /chat/completions with an image_url data URI), else
GEMINI_API_KEY/GOOGLE_API_KEY → Gemini (unchanged), else DOM-only as before.
OpenRouter wins if both are set. Default model is google/gemini-3.1-flash-lite
(the OpenRouter analog of the Gemini path's existing 3.1-flash-lite tier),
overridable via HYPERFRAMES_OPENROUTER_MODEL.

Both vision call sites — the image loop and the rasterized-SVG loop — route
through a single `captionOne` dispatcher, so the new provider works for SVGs too
(the original PR #840 only patched the image loop, which would have left
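OpenRouter-only users with crashing SVG captioning). The OpenRouter path checks
res.ok and, on failure, throws an error carrying the status and the first 200
characters of the response body. That rejection is per-image inside
Promise.allSettled, so a failed request yields no caption for that image
instead of aborting the whole captioning run.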

Reimplements #840 (which was unmergeable: saved with a UTF-8 BOM + CRLF so
GitHub rendered it as a binary diff, used `any`, reused the Gemini model env
var, and had a hallucinated default model id).

- Adds unit tests for the OpenRouter path (happy path, graceful degradation on
  non-OK status, no-key skip).
- Documents OPENROUTER_API_KEY in the website-to-video guide and the CLI capture
  reference.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/guides/website-to-video.mdx              |  12 +-
 docs/packages/cli.mdx                         |   2 +-
 .../cli/src/capture/contentExtractor.test.ts  |  99 +++++++++++
 packages/cli/src/capture/contentExtractor.ts  | 158 ++++++++++++------
 4 files changed, 216 insertions(+), 55 deletions(-)
 create mode 100644 packages/cli/src/capture/contentExtractor.test.ts
diff --git a/docs/guides/website-to-video.mdx b/docs/guides/website-to-video.mdx
index 464b86905d..47f2d734f6 100644
--- a/docs/guides/website-to-video.mdx
+++ b/docs/guides/website-to-video.mdx
@@ -93,14 +93,22 @@ The prompt determines the format. Include a duration and creative direction:
 
 ## Enriching Captures with Gemini Vision
 
-By default, captures describe assets using DOM context — alt text, nearby headings, CSS classes. Add a [Gemini API key](https://aistudio.google.com/apikey) for richer AI-powered descriptions using vision.
+By default, captures describe assets using DOM context — alt text, nearby headings, CSS classes. Add a vision API key for richer AI-powered descriptions.
 
-Create a `.env` file in your project root:
+Create a `.env` file in your project root with **either** a [Gemini API key](https://aistudio.google.com/apikey):
 
 ```bash
 echo "GEMINI_API_KEY=your-key-here" > .env
 ```
 
+…**or**, if you don't have Google access, an [OpenRouter key](https://openrouter.ai/keys) — a single API that fronts many vision models:
+
+```bash
+echo "OPENROUTER_API_KEY=your-key-here" > .env
+```
+
+OpenRouter is used when its key is present (it takes priority if both are set). The default model is `google/gemini-3.1-flash-lite`; override it with `HYPERFRAMES_OPENROUTER_MODEL` (any vision-capable OpenRouter model), just as `HYPERFRAMES_GEMINI_MODEL` overrides the Gemini default.
+
 <Tabs>
   <Tab title="Without Gemini">
     ```
diff --git a/docs/packages/cli.mdx b/docs/packages/cli.mdx
index a725e09deb..dc278dc29e 100644
--- a/docs/packages/cli.mdx
+++ b/docs/packages/cli.mdx
@@ -432,7 +432,7 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_
 
     Output is a self-contained directory with a `CLAUDE.md` file that any AI agent can read to understand the captured site. Used by the `/website-to-video` skill as step 1 of the video production pipeline.
 
-    Set `GEMINI_API_KEY` in a `.env` file for AI-powered image descriptions via Gemini vision (~$0.001/image). See the [Website to Video](/guides/website-to-video#enriching-captures-with-gemini-vision) guide for details.
+    Set `GEMINI_API_KEY` in a `.env` file for AI-powered image descriptions via Gemini vision (~$0.001/image), or set `OPENROUTER_API_KEY` to use any vision model through [OpenRouter](https://openrouter.ai) instead (takes priority if both are set; override the model with `HYPERFRAMES_OPENROUTER_MODEL`). See the [Website to Video](/guides/website-to-video#enriching-captures-with-gemini-vision) guide for details.
 
   </Tab>
   <Tab title="Preview">
diff --git a/packages/cli/src/capture/contentExtractor.test.ts b/packages/cli/src/capture/contentExtractor.test.ts
new file mode 100644
index 0000000000..44698a6893
--- /dev/null
+++ b/packages/cli/src/capture/contentExtractor.test.ts
@@ -0,0 +1,99 @@
+import { afterEach, describe, expect, it, vi } from "vitest";
+import { mkdtempSync, mkdirSync, writeFileSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { captionImagesWithGemini } from "./contentExtractor.js";
+
+// These tests exercise the OpenRouter provider path only — it makes a plain
+// `fetch` call we can stub, with no native (`sharp`) or `@google/genai`
+// dependency. OpenRouter wins over Gemini when OPENROUTER_API_KEY is set, so we
+// don't need to clear the Gemini keys for the OpenRouter cases.
+
+function makeProjectWithImage(): string {
+  const dir = mkdtempSync(join(tmpdir(), "hf-caption-"));
+  mkdirSync(join(dir, "assets"), { recursive: true });
+  // Contents are irrelevant to the OpenRouter path (it just base64-encodes the
+  // bytes); only the .png extension matters for the image filter.
+  writeFileSync(join(dir, "assets", "hero.png"), Buffer.from([0x89, 0x50, 0x4e, 0x47]));
+  return dir;
+}
+
+describe("captionImagesWithGemini — OpenRouter provider", () => {
+  const dirs: string[] = [];
+  afterEach(() => {
+    vi.unstubAllGlobals();
+    vi.unstubAllEnvs();
+    for (const d of dirs) rmSync(d, { recursive: true, force: true });
+    dirs.length = 0;
+  });
+
+  it("captions via OpenRouter when OPENROUTER_API_KEY is set", async () => {
+    const dir = makeProjectWithImage();
+    dirs.push(dir);
+    vi.stubEnv("OPENROUTER_API_KEY", "or-test-key");
+    vi.stubEnv("HYPERFRAMES_OPENROUTER_MODEL", "google/gemini-3.1-flash-lite");
+
+    const fetchMock = vi.fn(
+      async () =>
+        new Response(
+          JSON.stringify({ choices: [{ message: { content: "A dark hero with blue accents." } }] }),
+          { status: 200, headers: { "content-type": "application/json" } },
+        ),
+    );
+    vi.stubGlobal("fetch", fetchMock);
+
+    const warnings: string[] = [];
+    const captions = await captionImagesWithGemini(dir, () => {}, warnings);
+
+    expect(captions).toEqual({ "hero.png": "A dark hero with blue accents." });
+    expect(warnings).toEqual([]);
+    expect(fetchMock).toHaveBeenCalledTimes(1);
+
+    const [url, init] = fetchMock.mock.calls[0] as [string, RequestInit];
+    expect(url).toBe("https://openrouter.ai/api/v1/chat/completions");
+    expect((init.headers as Record<string, string>).Authorization).toBe("Bearer or-test-key");
+    const body = JSON.parse(init.body as string);
+    expect(body.model).toBe("google/gemini-3.1-flash-lite");
+    const parts = body.messages[0].content as Array<{ type: string; image_url?: { url: string } }>;
+    const image = parts.find((p) => p.type === "image_url");
+    expect(image?.image_url?.url).toMatch(/^data:image\/png;base64,/);
+  });
+
+  it("degrades gracefully (no throw, no captions) when OpenRouter returns a non-OK status", async () => {
+    const dir = makeProjectWithImage();
+    dirs.push(dir);
+    vi.stubEnv("OPENROUTER_API_KEY", "or-bad-key");
+
+    vi.stubGlobal(
+      "fetch",
+      vi.fn(
+        async () => new Response("invalid api key", { status: 401, statusText: "Unauthorized" }),
+      ),
+    );
+
+    const warnings: string[] = [];
+    // captionOne throws on !res.ok, but the throw is per-image inside
+    // Promise.allSettled, so it's filtered out as a rejected result rather than
+    // bubbling up — same silent degradation as the existing Gemini path.
+    const captions = await captionImagesWithGemini(dir, () => {}, warnings);
+
+    expect(captions).toEqual({});
+  });
+
+  it("skips captioning entirely when no provider key is present", async () => {
+    const dir = makeProjectWithImage();
+    dirs.push(dir);
+    vi.stubEnv("OPENROUTER_API_KEY", "");
+    vi.stubEnv("GEMINI_API_KEY", "");
+    vi.stubEnv("GOOGLE_API_KEY", "");
+
+    const fetchMock = vi.fn();
+    vi.stubGlobal("fetch", fetchMock);
+
+    const warnings: string[] = [];
+    const captions = await captionImagesWithGemini(dir, () => {}, warnings);
+
+    expect(captions).toEqual({});
+    expect(fetchMock).not.toHaveBeenCalled();
+  });
+});
diff --git a/packages/cli/src/capture/contentExtractor.ts b/packages/cli/src/capture/contentExtractor.ts
index 8fdd1681e1..29009fd827 100644
--- a/packages/cli/src/capture/contentExtractor.ts
+++ b/packages/cli/src/capture/contentExtractor.ts
@@ -1,7 +1,7 @@
 /**
  * Content extraction helpers for the website capture pipeline.
  *
- * Handles library detection, visible text extraction, Gemini captioning,
+ * Handles library detection, visible text extraction, vision captioning,
  * and asset description generation.
  *
  * All page.evaluate() calls use string expressions to avoid
@@ -156,7 +156,11 @@ export async function extractVisibleText(page: Page): Promise<string> {
 }
 
 /**
- * Caption downloaded images using Gemini vision API.
+ * Caption downloaded images using a vision model.
+ *
+ * Provider is chosen by which API key is present: OPENROUTER_API_KEY → OpenRouter
+ * (any vision model via its OpenAI-style API), else GEMINI_API_KEY/GOOGLE_API_KEY
+ * → Google Gemini, else no captioning. OpenRouter wins if both are set.
  *
  * Batches requests to stay under free-tier rate limits.
  * Returns a map of filename -> caption string.
@@ -167,26 +171,90 @@ export async function captionImagesWithGemini(
   warnings: string[],
 ): Promise<Record<string, string>> {
   const geminiCaptions: Record<string, string> = {};
+  const openRouterKey = process.env.OPENROUTER_API_KEY;
   const geminiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY;
-  if (!geminiKey) return geminiCaptions;
+  if (!openRouterKey && !geminiKey) return geminiCaptions;
+
+  // OpenRouter takes priority when both keys are set — it's the explicit opt-in
+  // for users without Google access. Both providers satisfy the same
+  // single-image → one-line-caption contract (`captionOne`), so the batching and
+  // SVG-rasterization loops below stay provider-agnostic.
+  const useOpenRouter = Boolean(openRouterKey);
+  const providerName = useOpenRouter ? "OpenRouter" : "Gemini";
+  // Default mirrors the Gemini path's tier (3.x flash-lite). Override per
+  // provider via HYPERFRAMES_OPENROUTER_MODEL / HYPERFRAMES_GEMINI_MODEL.
+  const model = useOpenRouter
+    ? process.env.HYPERFRAMES_OPENROUTER_MODEL || "google/gemini-3.1-flash-lite"
+    : process.env.HYPERFRAMES_GEMINI_MODEL || "gemini-3.1-flash-lite-preview";
 
-  progress("design", "Captioning images with Gemini vision...");
+  progress("design", `Captioning images with ${providerName} vision...`);
   try {
-    const { GoogleGenAI } = await import("@google/genai");
-    const ai = new GoogleGenAI({ apiKey: geminiKey });
+    // One image → one short caption. Each provider implements this contract;
+    // everything below is provider-agnostic.
+    type CaptionOne = (args: {
+      mimeType: string;
+      base64: string;
+      prompt: string;
+      maxTokens: number;
+    }) => Promise<string>;
+
+    let captionOne: CaptionOne;
+    if (openRouterKey) {
+      captionOne = async ({ mimeType, base64, prompt, maxTokens }) => {
+        const res = await fetch("https://openrouter.ai/api/v1/chat/completions", {
+          method: "POST",
+          headers: {
+            Authorization: `Bearer ${openRouterKey}`,
+            "Content-Type": "application/json",
+          },
+          body: JSON.stringify({
+            model,
+            messages: [
+              {
+                role: "user",
+                content: [
+                  { type: "text", text: prompt },
+                  { type: "image_url", image_url: { url: `data:${mimeType};base64,${base64}` } },
+                ],
+              },
+            ],
+            max_tokens: maxTokens,
+          }),
+        });
+        if (!res.ok) {
+          const detail = await res.text().catch(() => "");
+          throw new Error(`OpenRouter ${res.status} ${res.statusText}: ${detail.slice(0, 200)}`);
+        }
+        const data = (await res.json()) as {
+          choices?: Array<{ message?: { content?: string } }>;
+        };
+        return data.choices?.[0]?.message?.content?.trim() || "";
+      };
+    } else {
+      // Unreachable when geminiKey is unset (guarded above); re-narrow for TS.
+      if (!geminiKey) return geminiCaptions;
+      const { GoogleGenAI } = await import("@google/genai");
+      const ai = new GoogleGenAI({ apiKey: geminiKey });
+      captionOne = async ({ mimeType, base64, prompt, maxTokens }) => {
+        const response = await ai.models.generateContent({
+          model,
+          contents: [
+            { role: "user", parts: [{ inlineData: { mimeType, data: base64 } }, { text: prompt }] },
+          ],
+          config: { maxOutputTokens: maxTokens },
+        });
+        return response.text?.trim() || "";
+      };
+    }
+
     const imageFiles = readdirSync(join(outputDir, "assets")).filter((f: string) =>
       /\.(png|jpg|jpeg|webp|gif)$/i.test(f),
     );
 
-    // Caption in parallel batches via Gemini vision API.
-    // Free tier: 5 RPM → batch 5, 12s pause (~$0 but slow)
-    // Paid tier: 2000 RPM → batch 20, 1s pause (~$0.001/image, fast)
-    // We try a larger batch first; if rate-limited, fall back to smaller batches.
-    // Default is a preview model — update when GA ships.
-    // Benchmark (49 images, paid tier): 3.1-flash-lite-preview ~507ms/img 131ch avg,
-    // 2.5-flash-lite ~230ms/img 117ch avg. Preview has richer captions but higher variance.
-    // Override: HYPERFRAMES_GEMINI_MODEL=gemini-2.5-flash-lite
-    const model = process.env.HYPERFRAMES_GEMINI_MODEL || "gemini-3.1-flash-lite-preview";
+    // Caption in parallel batches. Gemini free tier is ~5 RPM (slow but $0),
+    // paid/OpenRouter ~2000 RPM. We batch 20 with a 2s inter-batch pause and rely
+    // on Promise.allSettled so a rate-limited image degrades to "" rather than
+    // failing the batch.
     const BATCH_SIZE = 20;
     for (let i = 0; i < imageFiles.length; i += BATCH_SIZE) {
       const batch = imageFiles.slice(i, i + BATCH_SIZE);
@@ -194,27 +262,19 @@ export async function captionImagesWithGemini(
         batch.map(async (file: string) => {
           const filePath = join(outputDir, "assets", file);
           const stat = statSync(filePath);
-          if (stat.size > 4_000_000) return { file, caption: "" }; // skip images > 4 MB (Gemini inline limit)
+          if (stat.size > 4_000_000) return { file, caption: "" }; // skip images > 4 MB (provider inline limit)
           const buffer = readFileSync(filePath);
           const base64 = buffer.toString("base64");
           const ext = file.split(".").pop()?.toLowerCase() || "png";
           const mimeType = ext === "jpg" ? "image/jpeg" : `image/${ext}`;
-          const response = await ai.models.generateContent({
-            model,
-            contents: [
-              {
-                role: "user",
-                parts: [
-                  { inlineData: { mimeType, data: base64 } },
-                  {
-                    text: "Describe this website image in ONE short sentence for a video storyboard. Focus on: what it shows, dominant colors, whether background is light or dark. Be factual, not creative.",
-                  },
-                ],
-              },
-            ],
-            config: { maxOutputTokens: 500 },
+          const caption = await captionOne({
+            mimeType,
+            base64,
+            prompt:
+              "Describe this website image in ONE short sentence for a video storyboard. Focus on: what it shows, dominant colors, whether background is light or dark. Be factual, not creative.",
+            maxTokens: 500,
           });
-          return { file, caption: response.text?.trim() || "" };
+          return { file, caption };
         }),
       );
       for (const result of results) {
@@ -231,7 +291,10 @@ export async function captionImagesWithGemini(
         `Captioned ${Math.min(i + BATCH_SIZE, imageFiles.length)}/${imageFiles.length} images...`,
       );
     }
-    progress("design", `${Object.keys(geminiCaptions).length} images captioned with Gemini`);
+    progress(
+      "design",
+      `${Object.keys(geminiCaptions).length} images captioned with ${providerName}`,
+    );
 
     // Rasterize SVGs to PNG before captioning — Vision hallucinates wordmarks when reading SVG path text.
     const svgFiles: Array<{ file: string; relPath: string }> = [];
@@ -301,26 +364,17 @@ export async function captionImagesWithGemini(
               svgsSkipped++;
               return { file: relPath, caption: "" };
             }
-            const response = await ai.models.generateContent({
-              model,
-              contents: [
-                {
-                  role: "user",
-                  parts: [
-                    { inlineData: { mimeType: "image/png", data: pngBase64 } },
-                    {
-                      text:
-                        "Describe this SVG asset rendered from a website in ONE short sentence for a video storyboard. " +
-                        "Focus on: what shape/icon/illustration/wordmark it is, its colors, any text it contains. " +
-                        "If you see a wordmark, READ THE LETTERS LITERALLY — do not guess a brand from context. " +
-                        "Be factual.",
-                    },
-                  ],
-                },
-              ],
-              config: { maxOutputTokens: 300 },
+            const caption = await captionOne({
+              mimeType: "image/png",
+              base64: pngBase64,
+              prompt:
+                "Describe this SVG asset rendered from a website in ONE short sentence for a video storyboard. " +
+                "Focus on: what shape/icon/illustration/wordmark it is, its colors, any text it contains. " +
+                "If you see a wordmark, READ THE LETTERS LITERALLY — do not guess a brand from context. " +
+                "Be factual.",
+              maxTokens: 300,
             });
-            return { file: relPath, caption: response.text?.trim() || "" };
+            return { file: relPath, caption };
           }),
         );
         for (const result of results) {
@@ -345,7 +399,7 @@ export async function captionImagesWithGemini(
       }
     }
   } catch (err) {
-    warnings.push(`Gemini captioning failed: ${err}`);
+    warnings.push(`${providerName} captioning failed: ${err}`);
   }
 
   return geminiCaptions;

From 3ba9b03096245427a7cbd4030f383db18778ec55 Mon Sep 17 00:00:00 2001
From: James <james.russo@heygen.com>
Date: Tue, 16 Jun 2026 02:48:32 +0000
Subject: [PATCH 2/2] =?UTF-8?q?test(cli):=20fix=20typecheck=20in=20OpenRou?=
 =?UTF-8?q?ter=20caption=20test=20=E2=80=94=20capture=20request=20without?=
 =?UTF-8?q?=20`as`?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The test cast `fetchMock.mock.calls[0]` to a tuple (TS2352: `[] | undefined`
doesn't overlap `[string, RequestInit]`), which failed the Typecheck CI job.
Capture the url/init inside the typed mock and assert via `new Headers()` +
`typeof` narrowing instead — no `as` assertions (which the repo bans anyway).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../cli/src/capture/contentExtractor.test.ts  | 29 ++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/packages/cli/src/capture/contentExtractor.test.ts b/packages/cli/src/capture/contentExtractor.test.ts
index 44698a6893..9683653848 100644
--- a/packages/cli/src/capture/contentExtractor.test.ts
+++ b/packages/cli/src/capture/contentExtractor.test.ts
@@ -33,13 +33,18 @@ describe("captionImagesWithGemini — OpenRouter provider", () => {
     vi.stubEnv("OPENROUTER_API_KEY", "or-test-key");
     vi.stubEnv("HYPERFRAMES_OPENROUTER_MODEL", "google/gemini-3.1-flash-lite");
 
-    const fetchMock = vi.fn(
-      async () =>
-        new Response(
-          JSON.stringify({ choices: [{ message: { content: "A dark hero with blue accents." } }] }),
-          { status: 200, headers: { "content-type": "application/json" } },
-        ),
-    );
+    // Capture the request inside the mock, where the args are well-typed —
+    // avoids casting `mock.calls` (and the repo's ban on `as` assertions).
+    let capturedUrl: string | undefined;
+    let capturedInit: RequestInit | undefined;
+    const fetchMock = vi.fn(async (url: string, init?: RequestInit) => {
+      capturedUrl = url;
+      capturedInit = init;
+      return new Response(
+        JSON.stringify({ choices: [{ message: { content: "A dark hero with blue accents." } }] }),
+        { status: 200, headers: { "content-type": "application/json" } },
+      );
+    });
     vi.stubGlobal("fetch", fetchMock);
 
     const warnings: string[] = [];
@@ -49,13 +54,11 @@ describe("captionImagesWithGemini — OpenRouter provider", () => {
     expect(warnings).toEqual([]);
     expect(fetchMock).toHaveBeenCalledTimes(1);
 
-    const [url, init] = fetchMock.mock.calls[0] as [string, RequestInit];
-    expect(url).toBe("https://openrouter.ai/api/v1/chat/completions");
-    expect((init.headers as Record<string, string>).Authorization).toBe("Bearer or-test-key");
-    const body = JSON.parse(init.body as string);
+    expect(capturedUrl).toBe("https://openrouter.ai/api/v1/chat/completions");
+    expect(new Headers(capturedInit?.headers).get("authorization")).toBe("Bearer or-test-key");
+    const body = JSON.parse(typeof capturedInit?.body === "string" ? capturedInit.body : "{}");
     expect(body.model).toBe("google/gemini-3.1-flash-lite");
-    const parts = body.messages[0].content as Array<{ type: string; image_url?: { url: string } }>;
-    const image = parts.find((p) => p.type === "image_url");
+    const image = body.messages[0].content.find((p: { type: string }) => p.type === "image_url");
     expect(image?.image_url?.url).toMatch(/^data:image\/png;base64,/);
   });