Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 190 additions & 7 deletions packages/opencode/src/tool/webfetch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,143 @@ const BROWSER_UA =
// Status codes that warrant a retry with a different User-Agent
const RETRYABLE_STATUSES = new Set([403, 406])

// altimate_change start — session-level URL failure cache (#471)
// Prevents repeated fetches to URLs that already returned 404/410 in this session.
// altimate_change start — process-level URL failure cache (#471)
// Prevents repeated fetches to URLs that already returned 404/410 in this
// process. Cache is module-scope so in daemon mode (`altimate serve`) and
// long-running MCP servers it persists across logical sessions — fine for
// 404/410 (resource-permanent) but worth flagging for 451 (geo/legal block,
// observer-conditional). See cacheUrlFailure comment for TTL rationale.
//
// 2026-05-22 follow-up to telemetry-2026-05-21 (486 residual webfetch 404s,
// down from March's 2,222 but still meaningful):
// - Extended TTL from 5 min to 30 min. URLs that 404 rarely self-heal
// within a session, and many sessions run longer than 5 min so the
// short window was letting agents re-hit the same dead URL multiple
// times in one session.
// - Cache key is the URL with tracking-only params (utm_*, fbclid,
// gclid, mc_*, _ga, _gl, igshid, mibextid, __cf_chl_*, plus HubSpot/
// Marketo/Adobe/Matomo variants) stripped, so LLM-generated tracking
// variations of a known-bad URL all hit cache. Functional query params
// (page, id, q, ref, etc.) are preserved — see exclusions block below
// for the rationale on `ref` specifically.
const failedUrls = new Map<string, { status: number; timestamp: number }>()
const FAILURE_CACHE_TTL = 5 * 60 * 1000 // 5 minutes
const FAILURE_CACHE_TTL = 30 * 60 * 1000 // 30 minutes — 404/410 (resource-permanent)
const FAILURE_CACHE_TTL_451 = 5 * 60 * 1000 // 5 minutes — 451 is observer-conditional (VPN swap, travel)

/**
* Param names that are universally tracking-only — stripping them never
* changes the document the URL points to. Conservative list; favoured over
* a regex sweep because false positives (e.g. stripping `?page=2`) would
* cause real fetches to incorrectly hit the cache.
*/
// Notable exclusions (do NOT add these to TRACKING_PARAMS):
// - `ref` / `ref_src` / `ref_url`: functional on GitHub raw URLs and
// git-hosting APIs (`?ref=main` vs `?ref=v2.0` selects a different
// branch/tag). Stripping `ref` would suppress legitimate fetches of
// different refs against the same path — a coding agent fetches GitHub
// heavily, so this is a real false-positive risk.
// - `referrer` (full word) is tracking-only and IS in the list; the
// abbreviated `ref` is not.
//
// The utm_* family is enumerated explicitly here AND covered by the
// `isTrackingParamPrefix` check below. The duplication is intentional —
// the explicit list documents the named params we know about, the prefix
// check catches any future utm_* variant we haven't enumerated.
// Cloudflare challenge tokens (`__cf_chl_*`) are handled by the prefix
// check exclusively; no explicit entries needed.
const TRACKING_PARAMS = new Set([
"utm_source",
"utm_medium",
"utm_campaign",
"utm_term",
"utm_content",
"utm_id",
"referrer",
"fbclid",
"gclid",
"dclid",
"msclkid",
"yclid",
"twclid",
"li_fat_id",
"mc_cid",
"mc_eid",
"_ga",
"_gl",
"igshid",
"mibextid",
"vero_conv",
"vero_id",
// HubSpot
"_hsenc",
"_hsmi",
"hsCtaTracking",
// Marketo
"mkt_tok",
// Adobe Analytics / SiteCatalyst
"s_cid",
"s_kwcid",
// Piwik / Matomo
"pk_campaign",
"pk_kwd",
"pk_source",
"pk_medium",
"pk_content",
"piwik_campaign",
"piwik_keyword",
])

function isTrackingParamPrefix(name: string): boolean {
return name.startsWith("__cf_chl_") || name.startsWith("utm_")
}

/**
* Normalize a URL for cache lookup. Strips tracking-only query params, sorts
* the remaining params for stable ordering, and lowercases the host. Returns
* the input unchanged if URL parsing fails.
*/
export function normalizeUrlForCache(url: string): string {
try {
const parsed = new URL(url)
parsed.hostname = parsed.hostname.toLowerCase()
// Strip userinfo (basic-auth `user:pass@`) — it doesn't change the
// resource the URL points to, and leaving it in lets the same logical
// URL with/without credentials occupy two cache slots. Also a minor
// hygiene win for cache keys that get logged in telemetry.
parsed.username = ""
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1: Do not strip URL userinfo from the failure-cache key; different credentials can legitimately produce different results, and this can cause false cached 404/410/451 misses for authenticated URLs.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At packages/opencode/src/tool/webfetch.ts, line 114:

<comment>Do not strip URL userinfo from the failure-cache key; different credentials can legitimately produce different results, and this can cause false cached 404/410/451 misses for authenticated URLs.</comment>

<file context>
@@ -90,6 +107,12 @@ export function normalizeUrlForCache(url: string): string {
+    // resource the URL points to, and leaving it in lets the same logical
+    // URL with/without credentials occupy two cache slots. Also a minor
+    // hygiene win for cache keys that get logged in telemetry.
+    parsed.username = ""
+    parsed.password = ""
     const kept: [string, string][] = []
</file context>

parsed.password = ""
const kept: [string, string][] = []
for (const [k, v] of parsed.searchParams) {
if (TRACKING_PARAMS.has(k) || isTrackingParamPrefix(k)) continue
kept.push([k, v])
}
// Sort by (key, value) so duplicate-key params produce a stable order:
// `?a=1&a=2` and `?a=2&a=1` normalize to the same string. Sorting by key
// alone leaked the original insertion order between same-key entries.
kept.sort(([ak, av], [bk, bv]) => {
if (ak !== bk) return ak < bk ? -1 : 1
return av < bv ? -1 : av > bv ? 1 : 0
})
parsed.search = ""
for (const [k, v] of kept) parsed.searchParams.append(k, v)
// Also strip fragment — fragments don't change the fetched resource.
parsed.hash = ""
return parsed.toString()
} catch {
return url
}
}

function ttlForStatus(status: number): number {
return status === 451 ? FAILURE_CACHE_TTL_451 : FAILURE_CACHE_TTL
}

function isUrlCachedFailure(url: string): { status: number } | null {
const entry = failedUrls.get(url)
const key = normalizeUrlForCache(url)
const entry = failedUrls.get(key)
if (!entry) return null
if (Date.now() - entry.timestamp > FAILURE_CACHE_TTL) {
failedUrls.delete(url)
if (Date.now() - entry.timestamp > ttlForStatus(entry.status)) {
failedUrls.delete(key)
return null
}
return { status: entry.status }
Expand All @@ -33,16 +160,72 @@ function isUrlCachedFailure(url: string): { status: number } | null {
const MAX_CACHED_URLS = 500

function cacheUrlFailure(url: string, status: number): void {
// TTL by status:
// 404 (resource not found) + 410 (gone) — 30 min. Both are
// resource-permanent; 410 is spec-permanent, 404 is overwhelmingly
// permanent in practice (typos, removed docs).
// 451 (unavailable for legal reasons) — 5 min. Observer-conditional:
// if the agent swaps networks (VPN, travel, mobile hotspot) the URL
// may become reachable from the new vantage point, so a long TTL is
// wrong. 5 min keeps the immediate-retry suppression but lets a
// network change unblock the URL.
if (status === 404 || status === 410 || status === 451) {
const key = normalizeUrlForCache(url)
// Re-touch semantics: if the key is already in the cache, delete first
// so the .set() reinserts at the tail of the FIFO. Otherwise a
// frequently-failing URL would stay at the head and be the first
// evicted under pressure — wrong order for an LRU-flavoured cache.
if (failedUrls.has(key)) failedUrls.delete(key)
if (failedUrls.size >= MAX_CACHED_URLS) {
// Evict oldest entry (Map preserves insertion order)
const oldest = failedUrls.keys().next().value
if (oldest) failedUrls.delete(oldest)
}
failedUrls.set(url, { status, timestamp: Date.now() })
failedUrls.set(key, { status, timestamp: Date.now() })
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}
}

/**
* Reset the failure cache.
* @internal — only used by tests; do not call from production code.
*/
export function _resetFailureCache(): void {
failedUrls.clear()
}

/**
* Read the current failure cache size.
* @internal — only used by tests; do not call from production code.
*/
export function _failureCacheSize(): number {
return failedUrls.size
}

/**
* Drive the cache state machine from tests — same code path as production.
* @internal — only used by tests; do not call from production code.
*/
export function _cacheUrlFailureForTest(url: string, status: number): void {
cacheUrlFailure(url, status)
}

/**
* Read cache key insertion order — the order entries would be evicted under
* FIFO/LRU pressure. Returns normalized cache keys, not raw URLs.
* @internal — only used by tests; do not call from production code.
*/
export function _failureCacheKeysInOrder(): string[] {
return [...failedUrls.keys()]
}

/**
* Probe the cache for a URL — same code path as production lookups.
* @internal — only used by tests; do not call from production code.
*/
export function _isUrlCachedFailureForTest(url: string): { status: number } | null {
return isUrlCachedFailure(url)
}

/** Strip query string from URL to avoid leaking auth tokens in error messages. */
function sanitizeUrl(url: string): string {
try {
Expand Down
Loading
Loading