From a45b8af251dfd1eb6c7fcc4bacee84cd23424756 Mon Sep 17 00:00:00 2001 From: James Date: Tue, 16 Jun 2026 01:14:03 +0000 Subject: [PATCH 1/2] feat(cli): support OpenRouter as an alternative vision provider for capture captioning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `hyperframes capture` could only enrich asset descriptions with Gemini vision, which requires a Google API key. Add OpenRouter as an alternative so users without Google access can caption via any vision-capable model through one unified key. Provider is selected by which key is present: OPENROUTER_API_KEY → OpenRouter (OpenAI-style /chat/completions with an image_url data URI), else GEMINI_API_KEY/GOOGLE_API_KEY → Gemini (unchanged), else DOM-only as before. OpenRouter wins if both are set. Default model is google/gemini-3.1-flash-lite (the OpenRouter analog of the Gemini path's existing 3.1-flash-lite tier), overridable via HYPERFRAMES_OPENROUTER_MODEL. Both vision call sites — the image loop and the rasterized-SVG loop — route through a single `captionOne` dispatcher, so the new provider works for SVGs too (the original PR #840 only patched the image loop, which would have left OpenRouter-only users with crashing SVG captioning). The OpenRouter path checks res.ok and surfaces the status/body on failure. Reimplements #840 (which was unmergeable: saved with a UTF-8 BOM + CRLF so GitHub rendered it as a binary diff, used `any`, reused the Gemini model env var, and had a hallucinated default model id). - Adds unit tests for the OpenRouter path (happy path, graceful degradation on non-OK status, no-key skip). - Documents OPENROUTER_API_KEY in the website-to-video guide and the CLI capture reference. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/guides/website-to-video.mdx | 12 +- docs/packages/cli.mdx | 2 +- .../cli/src/capture/contentExtractor.test.ts | 99 +++++++++++ packages/cli/src/capture/contentExtractor.ts | 158 ++++++++++++------ 4 files changed, 216 insertions(+), 55 deletions(-) create mode 100644 packages/cli/src/capture/contentExtractor.test.ts diff --git a/docs/guides/website-to-video.mdx b/docs/guides/website-to-video.mdx index 464b86905d..47f2d734f6 100644 --- a/docs/guides/website-to-video.mdx +++ b/docs/guides/website-to-video.mdx @@ -93,14 +93,22 @@ The prompt determines the format. Include a duration and creative direction: ## Enriching Captures with Gemini Vision -By default, captures describe assets using DOM context — alt text, nearby headings, CSS classes. Add a [Gemini API key](https://aistudio.google.com/apikey) for richer AI-powered descriptions using vision. +By default, captures describe assets using DOM context — alt text, nearby headings, CSS classes. Add a vision API key for richer AI-powered descriptions. -Create a `.env` file in your project root: +Create a `.env` file in your project root with **either** a [Gemini API key](https://aistudio.google.com/apikey): ```bash echo "GEMINI_API_KEY=your-key-here" > .env ``` +…**or**, if you don't have Google access, an [OpenRouter key](https://openrouter.ai/keys) — a single API that fronts many vision models: + +```bash +echo "OPENROUTER_API_KEY=your-key-here" > .env +``` + +OpenRouter is used when its key is present (it takes priority if both are set). The default model is `google/gemini-3.1-flash-lite`; override it with `HYPERFRAMES_OPENROUTER_MODEL` (any vision-capable OpenRouter model), just as `HYPERFRAMES_GEMINI_MODEL` overrides the Gemini default. 
+ ``` diff --git a/docs/packages/cli.mdx b/docs/packages/cli.mdx index a725e09deb..dc278dc29e 100644 --- a/docs/packages/cli.mdx +++ b/docs/packages/cli.mdx @@ -432,7 +432,7 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_ Output is a self-contained directory with a `CLAUDE.md` file that any AI agent can read to understand the captured site. Used by the `/website-to-video` skill as step 1 of the video production pipeline. - Set `GEMINI_API_KEY` in a `.env` file for AI-powered image descriptions via Gemini vision (~$0.001/image). See the [Website to Video](/guides/website-to-video#enriching-captures-with-gemini-vision) guide for details. + Set `GEMINI_API_KEY` in a `.env` file for AI-powered image descriptions via Gemini vision (~$0.001/image), or set `OPENROUTER_API_KEY` to use any vision model through [OpenRouter](https://openrouter.ai) instead (takes priority if both are set; override the model with `HYPERFRAMES_OPENROUTER_MODEL`). See the [Website to Video](/guides/website-to-video#enriching-captures-with-gemini-vision) guide for details. diff --git a/packages/cli/src/capture/contentExtractor.test.ts b/packages/cli/src/capture/contentExtractor.test.ts new file mode 100644 index 0000000000..44698a6893 --- /dev/null +++ b/packages/cli/src/capture/contentExtractor.test.ts @@ -0,0 +1,99 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import { mkdtempSync, mkdirSync, writeFileSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { captionImagesWithGemini } from "./contentExtractor.js"; + +// These tests exercise the OpenRouter provider path only — it makes a plain +// `fetch` call we can stub, with no native (`sharp`) or `@google/genai` +// dependency. OpenRouter wins over Gemini when OPENROUTER_API_KEY is set, so we +// don't need to clear the Gemini keys for the OpenRouter cases. 
+ +function makeProjectWithImage(): string { + const dir = mkdtempSync(join(tmpdir(), "hf-caption-")); + mkdirSync(join(dir, "assets"), { recursive: true }); + // Contents are irrelevant to the OpenRouter path (it just base64-encodes the + // bytes); only the .png extension matters for the image filter. + writeFileSync(join(dir, "assets", "hero.png"), Buffer.from([0x89, 0x50, 0x4e, 0x47])); + return dir; +} + +describe("captionImagesWithGemini — OpenRouter provider", () => { + const dirs: string[] = []; + afterEach(() => { + vi.unstubAllGlobals(); + vi.unstubAllEnvs(); + for (const d of dirs) rmSync(d, { recursive: true, force: true }); + dirs.length = 0; + }); + + it("captions via OpenRouter when OPENROUTER_API_KEY is set", async () => { + const dir = makeProjectWithImage(); + dirs.push(dir); + vi.stubEnv("OPENROUTER_API_KEY", "or-test-key"); + vi.stubEnv("HYPERFRAMES_OPENROUTER_MODEL", "google/gemini-3.1-flash-lite"); + + const fetchMock = vi.fn( + async () => + new Response( + JSON.stringify({ choices: [{ message: { content: "A dark hero with blue accents." } }] }), + { status: 200, headers: { "content-type": "application/json" } }, + ), + ); + vi.stubGlobal("fetch", fetchMock); + + const warnings: string[] = []; + const captions = await captionImagesWithGemini(dir, () => {}, warnings); + + expect(captions).toEqual({ "hero.png": "A dark hero with blue accents." 
}); + expect(warnings).toEqual([]); + expect(fetchMock).toHaveBeenCalledTimes(1); + + const [url, init] = fetchMock.mock.calls[0] as [string, RequestInit]; + expect(url).toBe("https://openrouter.ai/api/v1/chat/completions"); + expect((init.headers as Record).Authorization).toBe("Bearer or-test-key"); + const body = JSON.parse(init.body as string); + expect(body.model).toBe("google/gemini-3.1-flash-lite"); + const parts = body.messages[0].content as Array<{ type: string; image_url?: { url: string } }>; + const image = parts.find((p) => p.type === "image_url"); + expect(image?.image_url?.url).toMatch(/^data:image\/png;base64,/); + }); + + it("degrades gracefully (no throw, no captions) when OpenRouter returns a non-OK status", async () => { + const dir = makeProjectWithImage(); + dirs.push(dir); + vi.stubEnv("OPENROUTER_API_KEY", "or-bad-key"); + + vi.stubGlobal( + "fetch", + vi.fn( + async () => new Response("invalid api key", { status: 401, statusText: "Unauthorized" }), + ), + ); + + const warnings: string[] = []; + // captionOne throws on !res.ok, but the throw is per-image inside + // Promise.allSettled, so it's filtered out as a rejected result rather than + // bubbling up — same silent degradation as the existing Gemini path. 
+ const captions = await captionImagesWithGemini(dir, () => {}, warnings); + + expect(captions).toEqual({}); + }); + + it("skips captioning entirely when no provider key is present", async () => { + const dir = makeProjectWithImage(); + dirs.push(dir); + vi.stubEnv("OPENROUTER_API_KEY", ""); + vi.stubEnv("GEMINI_API_KEY", ""); + vi.stubEnv("GOOGLE_API_KEY", ""); + + const fetchMock = vi.fn(); + vi.stubGlobal("fetch", fetchMock); + + const warnings: string[] = []; + const captions = await captionImagesWithGemini(dir, () => {}, warnings); + + expect(captions).toEqual({}); + expect(fetchMock).not.toHaveBeenCalled(); + }); +}); diff --git a/packages/cli/src/capture/contentExtractor.ts b/packages/cli/src/capture/contentExtractor.ts index 8fdd1681e1..29009fd827 100644 --- a/packages/cli/src/capture/contentExtractor.ts +++ b/packages/cli/src/capture/contentExtractor.ts @@ -1,7 +1,7 @@ /** * Content extraction helpers for the website capture pipeline. * - * Handles library detection, visible text extraction, Gemini captioning, + * Handles library detection, visible text extraction, vision captioning, * and asset description generation. * * All page.evaluate() calls use string expressions to avoid @@ -156,7 +156,11 @@ export async function extractVisibleText(page: Page): Promise { } /** - * Caption downloaded images using Gemini vision API. + * Caption downloaded images using a vision model. + * + * Provider is chosen by which API key is present: OPENROUTER_API_KEY → OpenRouter + * (any vision model via its OpenAI-style API), else GEMINI_API_KEY/GOOGLE_API_KEY + * → Google Gemini, else no captioning. OpenRouter wins if both are set. * * Batches requests to stay under free-tier rate limits. * Returns a map of filename -> caption string. 
@@ -167,26 +171,90 @@ export async function captionImagesWithGemini( warnings: string[], ): Promise<Record<string, string>> { const geminiCaptions: Record<string, string> = {}; + const openRouterKey = process.env.OPENROUTER_API_KEY; const geminiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY; - if (!geminiKey) return geminiCaptions; + if (!openRouterKey && !geminiKey) return geminiCaptions; + + // OpenRouter takes priority when both keys are set — it's the explicit opt-in + // for users without Google access. Both providers satisfy the same + // single-image → one-line-caption contract (`captionOne`), so the batching and + // SVG-rasterization loops below stay provider-agnostic. + const useOpenRouter = Boolean(openRouterKey); + const providerName = useOpenRouter ? "OpenRouter" : "Gemini"; + // Default mirrors the Gemini path's tier (3.x flash-lite). Override per + // provider via HYPERFRAMES_OPENROUTER_MODEL / HYPERFRAMES_GEMINI_MODEL. + const model = useOpenRouter + ? process.env.HYPERFRAMES_OPENROUTER_MODEL || "google/gemini-3.1-flash-lite" + : process.env.HYPERFRAMES_GEMINI_MODEL || "gemini-3.1-flash-lite-preview"; - progress("design", "Captioning images with Gemini vision..."); + progress("design", `Captioning images with ${providerName} vision...`); try { - const { GoogleGenAI } = await import("@google/genai"); - const ai = new GoogleGenAI({ apiKey: geminiKey }); + // One image → one short caption. Each provider implements this contract; + // everything below is provider-agnostic. 
+ type CaptionOne = (args: { + mimeType: string; + base64: string; + prompt: string; + maxTokens: number; + }) => Promise<string>; + + let captionOne: CaptionOne; + if (openRouterKey) { + captionOne = async ({ mimeType, base64, prompt, maxTokens }) => { + const res = await fetch("https://openrouter.ai/api/v1/chat/completions", { + method: "POST", + headers: { + Authorization: `Bearer ${openRouterKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model, + messages: [ + { + role: "user", + content: [ + { type: "text", text: prompt }, + { type: "image_url", image_url: { url: `data:${mimeType};base64,${base64}` } }, + ], + }, + ], + max_tokens: maxTokens, + }), + }); + if (!res.ok) { + const detail = await res.text().catch(() => ""); + throw new Error(`OpenRouter ${res.status} ${res.statusText}: ${detail.slice(0, 200)}`); + } + const data = (await res.json()) as { + choices?: Array<{ message?: { content?: string } }>; + }; + return data.choices?.[0]?.message?.content?.trim() || ""; + }; + } else { + // Unreachable when geminiKey is unset (guarded above); re-narrow for TS. + if (!geminiKey) return geminiCaptions; + const { GoogleGenAI } = await import("@google/genai"); + const ai = new GoogleGenAI({ apiKey: geminiKey }); + captionOne = async ({ mimeType, base64, prompt, maxTokens }) => { + const response = await ai.models.generateContent({ + model, + contents: [ + { role: "user", parts: [{ inlineData: { mimeType, data: base64 } }, { text: prompt }] }, + ], + config: { maxOutputTokens: maxTokens }, + }); + return response.text?.trim() || ""; + }; + } + const imageFiles = readdirSync(join(outputDir, "assets")).filter((f: string) => /\.(png|jpg|jpeg|webp|gif)$/i.test(f), ); - // Caption in parallel batches via Gemini vision API. - // Free tier: 5 RPM → batch 5, 12s pause (~$0 but slow) - // Paid tier: 2000 RPM → batch 20, 1s pause (~$0.001/image, fast) - // We try a larger batch first; if rate-limited, fall back to smaller batches. 
- // Default is a preview model — update when GA ships. - // Benchmark (49 images, paid tier): 3.1-flash-lite-preview ~507ms/img 131ch avg, - // 2.5-flash-lite ~230ms/img 117ch avg. Preview has richer captions but higher variance. - // Override: HYPERFRAMES_GEMINI_MODEL=gemini-2.5-flash-lite - const model = process.env.HYPERFRAMES_GEMINI_MODEL || "gemini-3.1-flash-lite-preview"; + // Caption in parallel batches. Gemini free tier is ~5 RPM (slow but $0), + // paid/OpenRouter ~2000 RPM. We batch 20 with a 2s inter-batch pause and rely + // on Promise.allSettled so a rate-limited image degrades to "" rather than + // failing the batch. const BATCH_SIZE = 20; for (let i = 0; i < imageFiles.length; i += BATCH_SIZE) { const batch = imageFiles.slice(i, i + BATCH_SIZE); @@ -194,27 +262,19 @@ export async function captionImagesWithGemini( batch.map(async (file: string) => { const filePath = join(outputDir, "assets", file); const stat = statSync(filePath); - if (stat.size > 4_000_000) return { file, caption: "" }; // skip images > 4 MB (Gemini inline limit) + if (stat.size > 4_000_000) return { file, caption: "" }; // skip images > 4 MB (provider inline limit) const buffer = readFileSync(filePath); const base64 = buffer.toString("base64"); const ext = file.split(".").pop()?.toLowerCase() || "png"; const mimeType = ext === "jpg" ? "image/jpeg" : `image/${ext}`; - const response = await ai.models.generateContent({ - model, - contents: [ - { - role: "user", - parts: [ - { inlineData: { mimeType, data: base64 } }, - { - text: "Describe this website image in ONE short sentence for a video storyboard. Focus on: what it shows, dominant colors, whether background is light or dark. Be factual, not creative.", - }, - ], - }, - ], - config: { maxOutputTokens: 500 }, + const caption = await captionOne({ + mimeType, + base64, + prompt: + "Describe this website image in ONE short sentence for a video storyboard. 
Focus on: what it shows, dominant colors, whether background is light or dark. Be factual, not creative.", + maxTokens: 500, }); - return { file, caption: response.text?.trim() || "" }; + return { file, caption }; }), ); for (const result of results) { @@ -231,7 +291,10 @@ export async function captionImagesWithGemini( `Captioned ${Math.min(i + BATCH_SIZE, imageFiles.length)}/${imageFiles.length} images...`, ); } - progress("design", `${Object.keys(geminiCaptions).length} images captioned with Gemini`); + progress( + "design", + `${Object.keys(geminiCaptions).length} images captioned with ${providerName}`, + ); // Rasterize SVGs to PNG before captioning — Vision hallucinates wordmarks when reading SVG path text. const svgFiles: Array<{ file: string; relPath: string }> = []; @@ -301,26 +364,17 @@ export async function captionImagesWithGemini( svgsSkipped++; return { file: relPath, caption: "" }; } - const response = await ai.models.generateContent({ - model, - contents: [ - { - role: "user", - parts: [ - { inlineData: { mimeType: "image/png", data: pngBase64 } }, - { - text: - "Describe this SVG asset rendered from a website in ONE short sentence for a video storyboard. " + - "Focus on: what shape/icon/illustration/wordmark it is, its colors, any text it contains. " + - "If you see a wordmark, READ THE LETTERS LITERALLY — do not guess a brand from context. " + - "Be factual.", - }, - ], - }, - ], - config: { maxOutputTokens: 300 }, + const caption = await captionOne({ + mimeType: "image/png", + base64: pngBase64, + prompt: + "Describe this SVG asset rendered from a website in ONE short sentence for a video storyboard. " + + "Focus on: what shape/icon/illustration/wordmark it is, its colors, any text it contains. " + + "If you see a wordmark, READ THE LETTERS LITERALLY — do not guess a brand from context. 
" + + "Be factual.", + maxTokens: 300, }); - return { file: relPath, caption: response.text?.trim() || "" }; + return { file: relPath, caption }; }), ); for (const result of results) { @@ -345,7 +399,7 @@ export async function captionImagesWithGemini( } } } catch (err) { - warnings.push(`Gemini captioning failed: ${err}`); + warnings.push(`${providerName} captioning failed: ${err}`); } return geminiCaptions; From 3ba9b03096245427a7cbd4030f383db18778ec55 Mon Sep 17 00:00:00 2001 From: James Date: Tue, 16 Jun 2026 02:48:32 +0000 Subject: [PATCH 2/2] =?UTF-8?q?test(cli):=20fix=20typecheck=20in=20OpenRou?= =?UTF-8?q?ter=20caption=20test=20=E2=80=94=20capture=20request=20without?= =?UTF-8?q?=20`as`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test cast `fetchMock.mock.calls[0]` to a tuple (TS2352: `[] | undefined` doesn't overlap `[string, RequestInit]`), which failed the Typecheck CI job. Capture the url/init inside the typed mock and assert via `new Headers()` + `typeof` narrowing instead — no `as` assertions (which the repo bans anyway). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../cli/src/capture/contentExtractor.test.ts | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/packages/cli/src/capture/contentExtractor.test.ts b/packages/cli/src/capture/contentExtractor.test.ts index 44698a6893..9683653848 100644 --- a/packages/cli/src/capture/contentExtractor.test.ts +++ b/packages/cli/src/capture/contentExtractor.test.ts @@ -33,13 +33,18 @@ describe("captionImagesWithGemini — OpenRouter provider", () => { vi.stubEnv("OPENROUTER_API_KEY", "or-test-key"); vi.stubEnv("HYPERFRAMES_OPENROUTER_MODEL", "google/gemini-3.1-flash-lite"); - const fetchMock = vi.fn( - async () => - new Response( - JSON.stringify({ choices: [{ message: { content: "A dark hero with blue accents." 
} }] }), - { status: 200, headers: { "content-type": "application/json" } }, - ), - ); + // Capture the request inside the mock, where the args are well-typed — + // avoids casting `mock.calls` (and the repo's ban on `as` assertions). + let capturedUrl: string | undefined; + let capturedInit: RequestInit | undefined; + const fetchMock = vi.fn(async (url: string, init?: RequestInit) => { + capturedUrl = url; + capturedInit = init; + return new Response( + JSON.stringify({ choices: [{ message: { content: "A dark hero with blue accents." } }] }), + { status: 200, headers: { "content-type": "application/json" } }, + ); + }); vi.stubGlobal("fetch", fetchMock); const warnings: string[] = []; @@ -49,13 +54,11 @@ describe("captionImagesWithGemini — OpenRouter provider", () => { expect(warnings).toEqual([]); expect(fetchMock).toHaveBeenCalledTimes(1); - const [url, init] = fetchMock.mock.calls[0] as [string, RequestInit]; - expect(url).toBe("https://openrouter.ai/api/v1/chat/completions"); - expect((init.headers as Record).Authorization).toBe("Bearer or-test-key"); - const body = JSON.parse(init.body as string); + expect(capturedUrl).toBe("https://openrouter.ai/api/v1/chat/completions"); + expect(new Headers(capturedInit?.headers).get("authorization")).toBe("Bearer or-test-key"); + const body = JSON.parse(typeof capturedInit?.body === "string" ? capturedInit.body : "{}"); expect(body.model).toBe("google/gemini-3.1-flash-lite"); - const parts = body.messages[0].content as Array<{ type: string; image_url?: { url: string } }>; - const image = parts.find((p) => p.type === "image_url"); + const image = body.messages[0].content.find((p: { type: string }) => p.type === "image_url"); expect(image?.image_url?.url).toMatch(/^data:image\/png;base64,/); });