Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions docs/guides/website-to-video.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,22 @@ The prompt determines the format. Include a duration and creative direction:

## Enriching Captures with Gemini Vision

By default, captures describe assets using DOM context — alt text, nearby headings, CSS classes. Add a [Gemini API key](https://aistudio.google.com/apikey) for richer AI-powered descriptions using vision.
By default, captures describe assets using DOM context — alt text, nearby headings, CSS classes. Add a vision API key for richer AI-powered descriptions.

Create a `.env` file in your project root:
Create a `.env` file in your project root with **either** a [Gemini API key](https://aistudio.google.com/apikey):

```bash
echo "GEMINI_API_KEY=your-key-here" > .env
```

…**or**, if you don't have Google access, an [OpenRouter key](https://openrouter.ai/keys) — a single API that fronts many vision models:

```bash
echo "OPENROUTER_API_KEY=your-key-here" > .env
```

When `OPENROUTER_API_KEY` is set, captions go through OpenRouter — it takes priority even if a Gemini key is also present. The default OpenRouter model is `google/gemini-3.1-flash-lite`; override it with `HYPERFRAMES_OPENROUTER_MODEL` (any vision-capable OpenRouter model), just as `HYPERFRAMES_GEMINI_MODEL` overrides the Gemini default (`gemini-3.1-flash-lite-preview`).

<Tabs>
<Tab title="Without Gemini">
```
Expand Down
2 changes: 1 addition & 1 deletion docs/packages/cli.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ This is suppressed in CI environments, non-TTY shells, and when `HYPERFRAMES_NO_

Output is a self-contained directory with a `CLAUDE.md` file that any AI agent can read to understand the captured site. Used by the `/website-to-video` skill as step 1 of the video production pipeline.

Set `GEMINI_API_KEY` in a `.env` file for AI-powered image descriptions via Gemini vision (~$0.001/image). See the [Website to Video](/guides/website-to-video#enriching-captures-with-gemini-vision) guide for details.
Set `GEMINI_API_KEY` in a `.env` file for AI-powered image descriptions via Gemini vision (~$0.001/image), or set `OPENROUTER_API_KEY` to use any vision model through [OpenRouter](https://openrouter.ai) instead. If both keys are set, OpenRouter takes priority; override its model with `HYPERFRAMES_OPENROUTER_MODEL`. See the [Website to Video](/guides/website-to-video#enriching-captures-with-gemini-vision) guide for details.

</Tab>
<Tab title="Preview">
Expand Down
102 changes: 102 additions & 0 deletions packages/cli/src/capture/contentExtractor.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import { afterEach, describe, expect, it, vi } from "vitest";
import { mkdtempSync, mkdirSync, writeFileSync, rmSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { captionImagesWithGemini } from "./contentExtractor.js";

// These tests exercise the OpenRouter provider path only — it makes a plain
// `fetch` call we can stub, with no native (`sharp`) or `@google/genai`
// dependency. OpenRouter wins over Gemini when OPENROUTER_API_KEY is set, so we
// don't need to clear the Gemini keys for the OpenRouter cases.

/**
 * Builds a throwaway capture-project directory under the OS temp dir that
 * contains exactly one image, `assets/hero.png`.
 *
 * @returns Absolute path of the freshly created project root. The caller is
 *   responsible for deleting it.
 */
function makeProjectWithImage(): string {
  const projectRoot = mkdtempSync(join(tmpdir(), "hf-caption-"));
  const assetsDir = join(projectRoot, "assets");
  mkdirSync(assetsDir, { recursive: true });

  // The OpenRouter path only base64-encodes whatever bytes it reads, so the
  // payload does not need to be a decodable image — the `.png` extension is
  // what gets the file past the captioner's image filter.
  const placeholderBytes = Buffer.from([0x89, 0x50, 0x4e, 0x47]);
  writeFileSync(join(assetsDir, "hero.png"), placeholderBytes);

  return projectRoot;
}

/**
 * OpenRouter provider path of `captionImagesWithGemini`.
 *
 * Every case stubs the global `fetch` and the relevant env vars, then runs the
 * captioner against a temp project holding a single `assets/hero.png`.
 * Globals, env stubs and temp directories are reset after each test.
 */
describe("captionImagesWithGemini — OpenRouter provider", () => {
  // Temp project roots created by the running test; removed in afterEach.
  const dirs: string[] = [];
  afterEach(() => {
    vi.unstubAllGlobals();
    vi.unstubAllEnvs();
    for (const d of dirs) rmSync(d, { recursive: true, force: true });
    dirs.length = 0;
  });

  it("captions via OpenRouter when OPENROUTER_API_KEY is set", async () => {
    const dir = makeProjectWithImage();
    dirs.push(dir);
    vi.stubEnv("OPENROUTER_API_KEY", "or-test-key");
    // Deliberately NOT the built-in default model: if the override env var
    // were ignored, the `body.model` assertion below would fail instead of
    // passing by coincidence.
    vi.stubEnv("HYPERFRAMES_OPENROUTER_MODEL", "example/vision-override");

    // Capture the request inside the mock, where the args are well-typed —
    // avoids casting `mock.calls` (and the repo's ban on `as` assertions).
    let capturedUrl: string | undefined;
    let capturedInit: RequestInit | undefined;
    const fetchMock = vi.fn(async (url: string, init?: RequestInit) => {
      capturedUrl = url;
      capturedInit = init;
      return new Response(
        JSON.stringify({ choices: [{ message: { content: "A dark hero with blue accents." } }] }),
        { status: 200, headers: { "content-type": "application/json" } },
      );
    });
    vi.stubGlobal("fetch", fetchMock);

    const warnings: string[] = [];
    const captions = await captionImagesWithGemini(dir, () => {}, warnings);

    expect(captions).toEqual({ "hero.png": "A dark hero with blue accents." });
    expect(warnings).toEqual([]);
    expect(fetchMock).toHaveBeenCalledTimes(1);

    // Request shape: endpoint, bearer auth, overridden model, inline data-URL image.
    expect(capturedUrl).toBe("https://openrouter.ai/api/v1/chat/completions");
    expect(new Headers(capturedInit?.headers).get("authorization")).toBe("Bearer or-test-key");
    const body = JSON.parse(typeof capturedInit?.body === "string" ? capturedInit.body : "{}");
    expect(body.model).toBe("example/vision-override");
    const image = body.messages[0].content.find((p: { type: string }) => p.type === "image_url");
    expect(image?.image_url?.url).toMatch(/^data:image\/png;base64,/);
  });

  it("degrades gracefully (no throw, no captions) when OpenRouter returns a non-OK status", async () => {
    const dir = makeProjectWithImage();
    dirs.push(dir);
    vi.stubEnv("OPENROUTER_API_KEY", "or-bad-key");

    const fetchMock = vi.fn(
      async () => new Response("invalid api key", { status: 401, statusText: "Unauthorized" }),
    );
    vi.stubGlobal("fetch", fetchMock);

    const warnings: string[] = [];
    // captionOne throws on !res.ok, but the throw is per-image inside
    // Promise.allSettled, so it's filtered out as a rejected result rather than
    // bubbling up — same silent degradation as the existing Gemini path.
    const captions = await captionImagesWithGemini(dir, () => {}, warnings);

    expect(captions).toEqual({});
    // Guard against a vacuous pass: an empty result only demonstrates graceful
    // degradation if the provider was actually contacted.
    expect(fetchMock).toHaveBeenCalled();
  });

  it("skips captioning entirely when no provider key is present", async () => {
    const dir = makeProjectWithImage();
    dirs.push(dir);
    // Empty strings are falsy, so all three key lookups resolve to "no key".
    vi.stubEnv("OPENROUTER_API_KEY", "");
    vi.stubEnv("GEMINI_API_KEY", "");
    vi.stubEnv("GOOGLE_API_KEY", "");

    const fetchMock = vi.fn();
    vi.stubGlobal("fetch", fetchMock);

    const warnings: string[] = [];
    const captions = await captionImagesWithGemini(dir, () => {}, warnings);

    expect(captions).toEqual({});
    expect(fetchMock).not.toHaveBeenCalled();
    // The no-key early return happens before any provider work, so nothing
    // should have been reported as a warning either.
    expect(warnings).toEqual([]);
  });
});
158 changes: 106 additions & 52 deletions packages/cli/src/capture/contentExtractor.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* Content extraction helpers for the website capture pipeline.
*
* Handles library detection, visible text extraction, Gemini captioning,
* Handles library detection, visible text extraction, vision captioning,
* and asset description generation.
*
* All page.evaluate() calls use string expressions to avoid
Expand Down Expand Up @@ -156,7 +156,11 @@
}

/**
* Caption downloaded images using Gemini vision API.
* Caption downloaded images using a vision model.
*
* Provider is chosen by which API key is present: OPENROUTER_API_KEY → OpenRouter
* (any vision model via its OpenAI-style API), else GEMINI_API_KEY/GOOGLE_API_KEY
* → Google Gemini, else no captioning. OpenRouter wins if both are set.
*
* Batches requests to stay under free-tier rate limits.
* Returns a map of filename -> caption string.
Expand All @@ -167,54 +171,110 @@
warnings: string[],
): Promise<Record<string, string>> {
const geminiCaptions: Record<string, string> = {};
const openRouterKey = process.env.OPENROUTER_API_KEY;
const geminiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY;
if (!geminiKey) return geminiCaptions;
if (!openRouterKey && !geminiKey) return geminiCaptions;

// OpenRouter takes priority when both keys are set — it's the explicit opt-in
// for users without Google access. Both providers satisfy the same
// single-image → one-line-caption contract (`captionOne`), so the batching and
// SVG-rasterization loops below stay provider-agnostic.
const useOpenRouter = Boolean(openRouterKey);
const providerName = useOpenRouter ? "OpenRouter" : "Gemini";
// Default mirrors the Gemini path's tier (3.x flash-lite). Override per
// provider via HYPERFRAMES_OPENROUTER_MODEL / HYPERFRAMES_GEMINI_MODEL.
const model = useOpenRouter
? process.env.HYPERFRAMES_OPENROUTER_MODEL || "google/gemini-3.1-flash-lite"
: process.env.HYPERFRAMES_GEMINI_MODEL || "gemini-3.1-flash-lite-preview";

progress("design", "Captioning images with Gemini vision...");
progress("design", `Captioning images with ${providerName} vision...`);
try {
const { GoogleGenAI } = await import("@google/genai");
const ai = new GoogleGenAI({ apiKey: geminiKey });
// One image → one short caption. Each provider implements this contract;
// everything below is provider-agnostic.
type CaptionOne = (args: {
mimeType: string;
base64: string;
prompt: string;
maxTokens: number;
}) => Promise<string>;

let captionOne: CaptionOne;
if (openRouterKey) {
captionOne = async ({ mimeType, base64, prompt, maxTokens }) => {
const res = await fetch("https://openrouter.ai/api/v1/chat/completions", {
method: "POST",
headers: {
Authorization: `Bearer ${openRouterKey}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
model,
messages: [
{
role: "user",
content: [
{ type: "text", text: prompt },
{ type: "image_url", image_url: { url: `data:${mimeType};base64,${base64}` } },
],
},
],
max_tokens: maxTokens,
}),

Check warning

Code scanning / CodeQL

File data in outbound network request Medium

Outbound network request depends on
file data
.
Comment thread
jrusso1020 marked this conversation as resolved.
Dismissed
});
if (!res.ok) {
const detail = await res.text().catch(() => "");
throw new Error(`OpenRouter ${res.status} ${res.statusText}: ${detail.slice(0, 200)}`);
}
const data = (await res.json()) as {
choices?: Array<{ message?: { content?: string } }>;
};
return data.choices?.[0]?.message?.content?.trim() || "";
};
} else {
// Unreachable when geminiKey is unset (guarded above); re-narrow for TS.
if (!geminiKey) return geminiCaptions;
const { GoogleGenAI } = await import("@google/genai");
const ai = new GoogleGenAI({ apiKey: geminiKey });
captionOne = async ({ mimeType, base64, prompt, maxTokens }) => {
const response = await ai.models.generateContent({
model,
contents: [
{ role: "user", parts: [{ inlineData: { mimeType, data: base64 } }, { text: prompt }] },
],
config: { maxOutputTokens: maxTokens },
});
return response.text?.trim() || "";
};
}

const imageFiles = readdirSync(join(outputDir, "assets")).filter((f: string) =>
/\.(png|jpg|jpeg|webp|gif)$/i.test(f),
);

// Caption in parallel batches via Gemini vision API.
// Free tier: 5 RPM → batch 5, 12s pause (~$0 but slow)
// Paid tier: 2000 RPM → batch 20, 1s pause (~$0.001/image, fast)
// We try a larger batch first; if rate-limited, fall back to smaller batches.
// Default is a preview model — update when GA ships.
// Benchmark (49 images, paid tier): 3.1-flash-lite-preview ~507ms/img 131ch avg,
// 2.5-flash-lite ~230ms/img 117ch avg. Preview has richer captions but higher variance.
// Override: HYPERFRAMES_GEMINI_MODEL=gemini-2.5-flash-lite
const model = process.env.HYPERFRAMES_GEMINI_MODEL || "gemini-3.1-flash-lite-preview";
// Caption in parallel batches. Gemini free tier is ~5 RPM (slow but $0),
// paid/OpenRouter ~2000 RPM. We batch 20 with a 2s inter-batch pause and rely
// on Promise.allSettled so a rate-limited image degrades to "" rather than
// failing the batch.
const BATCH_SIZE = 20;
for (let i = 0; i < imageFiles.length; i += BATCH_SIZE) {
const batch = imageFiles.slice(i, i + BATCH_SIZE);
const results = await Promise.allSettled(
batch.map(async (file: string) => {
const filePath = join(outputDir, "assets", file);
const stat = statSync(filePath);
if (stat.size > 4_000_000) return { file, caption: "" }; // skip images > 4 MB (Gemini inline limit)
if (stat.size > 4_000_000) return { file, caption: "" }; // skip images > 4 MB (provider inline limit)
const buffer = readFileSync(filePath);
const base64 = buffer.toString("base64");
const ext = file.split(".").pop()?.toLowerCase() || "png";
const mimeType = ext === "jpg" ? "image/jpeg" : `image/${ext}`;
const response = await ai.models.generateContent({
model,
contents: [
{
role: "user",
parts: [
{ inlineData: { mimeType, data: base64 } },
{
text: "Describe this website image in ONE short sentence for a video storyboard. Focus on: what it shows, dominant colors, whether background is light or dark. Be factual, not creative.",
},
],
},
],
config: { maxOutputTokens: 500 },
const caption = await captionOne({
mimeType,
base64,
prompt:
"Describe this website image in ONE short sentence for a video storyboard. Focus on: what it shows, dominant colors, whether background is light or dark. Be factual, not creative.",
maxTokens: 500,
});
return { file, caption: response.text?.trim() || "" };
return { file, caption };
}),
);
for (const result of results) {
Expand All @@ -231,7 +291,10 @@
`Captioned ${Math.min(i + BATCH_SIZE, imageFiles.length)}/${imageFiles.length} images...`,
);
}
progress("design", `${Object.keys(geminiCaptions).length} images captioned with Gemini`);
progress(
"design",
`${Object.keys(geminiCaptions).length} images captioned with ${providerName}`,
);

// Rasterize SVGs to PNG before captioning — Vision hallucinates wordmarks when reading SVG path text.
const svgFiles: Array<{ file: string; relPath: string }> = [];
Expand Down Expand Up @@ -301,26 +364,17 @@
svgsSkipped++;
return { file: relPath, caption: "" };
}
const response = await ai.models.generateContent({
model,
contents: [
{
role: "user",
parts: [
{ inlineData: { mimeType: "image/png", data: pngBase64 } },
{
text:
"Describe this SVG asset rendered from a website in ONE short sentence for a video storyboard. " +
"Focus on: what shape/icon/illustration/wordmark it is, its colors, any text it contains. " +
"If you see a wordmark, READ THE LETTERS LITERALLY — do not guess a brand from context. " +
"Be factual.",
},
],
},
],
config: { maxOutputTokens: 300 },
const caption = await captionOne({
mimeType: "image/png",
base64: pngBase64,
prompt:
"Describe this SVG asset rendered from a website in ONE short sentence for a video storyboard. " +
"Focus on: what shape/icon/illustration/wordmark it is, its colors, any text it contains. " +
"If you see a wordmark, READ THE LETTERS LITERALLY — do not guess a brand from context. " +
"Be factual.",
maxTokens: 300,
});
return { file: relPath, caption: response.text?.trim() || "" };
return { file: relPath, caption };
}),
);
for (const result of results) {
Expand All @@ -345,7 +399,7 @@
}
}
} catch (err) {
warnings.push(`Gemini captioning failed: ${err}`);
warnings.push(`${providerName} captioning failed: ${err}`);
}

return geminiCaptions;
Expand Down
Loading