diff --git a/crates/common/Cargo.toml b/crates/common/Cargo.toml index bbb8a2a..ab3058c 100644 --- a/crates/common/Cargo.toml +++ b/crates/common/Cargo.toml @@ -51,6 +51,7 @@ config = { workspace = true } derive_more = { workspace = true } error-stack = { workspace = true } http = { workspace = true } +log = { workspace = true } regex = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/crates/common/src/host_rewrite.rs b/crates/common/src/host_rewrite.rs new file mode 100644 index 0000000..cb15f40 --- /dev/null +++ b/crates/common/src/host_rewrite.rs @@ -0,0 +1,49 @@ +/// Rewrite bare host occurrences (e.g. `origin.example.com/news`) only when the match is a full +/// hostname token, not part of a larger hostname like `cdn.origin.example.com`. +/// +/// This is used by both HTML (`__next_f` payloads) and Flight (`text/x-component`) rewriting to +/// avoid corrupting unrelated hostnames. +pub(crate) fn rewrite_bare_host_at_boundaries( + text: &str, + origin_host: &str, + request_host: &str, +) -> Option<String> { + if origin_host.is_empty() || request_host.is_empty() || !text.contains(origin_host) { + return None; + } + + fn is_host_char(byte: u8) -> bool { + byte.is_ascii_alphanumeric() || matches!(byte, b'.' 
| b'-' | b':') + } + + let origin_len = origin_host.len(); + let bytes = text.as_bytes(); + let mut out = String::with_capacity(text.len()); + let mut search = 0; + let mut replaced_any = false; + + while let Some(rel) = text[search..].find(origin_host) { + let pos = search + rel; + let end = pos + origin_len; + + let before_ok = pos == 0 || !is_host_char(bytes[pos - 1]); + let after_ok = end == bytes.len() || !is_host_char(bytes[end]); + + if before_ok && after_ok { + out.push_str(&text[search..pos]); + out.push_str(request_host); + replaced_any = true; + search = end; + } else { + out.push_str(&text[search..pos + 1]); + search = pos + 1; + } + } + + if !replaced_any { + return None; + } + + out.push_str(&text[search..]); + Some(out) +} diff --git a/crates/common/src/html_processor.rs b/crates/common/src/html_processor.rs index a0ac143..1803436 100644 --- a/crates/common/src/html_processor.rs +++ b/crates/common/src/html_processor.rs @@ -2,18 +2,87 @@ //! //! This module provides a StreamProcessor implementation for HTML content. 
use std::cell::Cell; +use std::io; use std::rc::Rc; +use std::sync::Arc; use lol_html::{element, html_content::ContentType, text, Settings as RewriterSettings}; use crate::integrations::{ - AttributeRewriteOutcome, IntegrationAttributeContext, IntegrationRegistry, + AttributeRewriteOutcome, IntegrationAttributeContext, IntegrationDocumentState, + IntegrationHtmlContext, IntegrationHtmlPostProcessor, IntegrationRegistry, IntegrationScriptContext, ScriptRewriteAction, }; use crate::settings::Settings; use crate::streaming_processor::{HtmlRewriterAdapter, StreamProcessor}; use crate::tsjs; +struct HtmlWithPostProcessing { + inner: HtmlRewriterAdapter, + post_processors: Vec>, + origin_host: String, + request_host: String, + request_scheme: String, + document_state: IntegrationDocumentState, +} + +impl StreamProcessor for HtmlWithPostProcessing { + fn process_chunk(&mut self, chunk: &[u8], is_last: bool) -> Result, io::Error> { + let output = self.inner.process_chunk(chunk, is_last)?; + if !is_last || output.is_empty() || self.post_processors.is_empty() { + return Ok(output); + } + + let Ok(output_str) = std::str::from_utf8(&output) else { + return Ok(output); + }; + + let ctx = IntegrationHtmlContext { + request_host: &self.request_host, + request_scheme: &self.request_scheme, + origin_host: &self.origin_host, + document_state: &self.document_state, + }; + + // Preflight to avoid allocating a `String` unless at least one post-processor wants to run. 
+ if !self + .post_processors + .iter() + .any(|p| p.should_process(output_str, &ctx)) + { + return Ok(output); + } + + let mut html = String::from_utf8(output).map_err(|e| { + io::Error::other(format!( + "HTML post-processing expected valid UTF-8 output: {e}" + )) + })?; + + let mut changed = false; + for processor in &self.post_processors { + if processor.should_process(&html, &ctx) { + changed |= processor.post_process(&mut html, &ctx); + } + } + + if changed { + log::debug!( + "HTML post-processing complete: origin_host={}, output_len={}", + self.origin_host, + html.len() + ); + } + + Ok(html.into_bytes()) + } + + fn reset(&mut self) { + self.inner.reset(); + self.document_state.clear(); + } +} + /// Configuration for HTML processing #[derive(Clone)] pub struct HtmlProcessorConfig { @@ -43,6 +112,9 @@ impl HtmlProcessorConfig { /// Create an HTML processor with URL replacement and optional Prebid injection pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcessor { + let post_processors = config.integrations.html_post_processors(); + let document_state = IntegrationDocumentState::default(); + // Simplified URL patterns structure - stores only core data and generates variants on-demand struct UrlPatterns { origin_host: String, @@ -70,6 +142,37 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso fn protocol_relative_replacement(&self) -> String { format!("//{}", self.request_host) } + + fn rewrite_url_value(&self, value: &str) -> Option { + if !value.contains(&self.origin_host) { + return None; + } + + let https_origin = self.https_origin(); + let http_origin = self.http_origin(); + let protocol_relative_origin = self.protocol_relative_origin(); + let replacement_url = self.replacement_url(); + let protocol_relative_replacement = self.protocol_relative_replacement(); + + let mut rewritten = value + .replace(&https_origin, &replacement_url) + .replace(&http_origin, &replacement_url) + 
.replace(&protocol_relative_origin, &protocol_relative_replacement); + + if rewritten.starts_with(&self.origin_host) { + let suffix = &rewritten[self.origin_host.len()..]; + let boundary_ok = suffix.is_empty() + || matches!( + suffix.as_bytes().first(), + Some(b'/') | Some(b'?') | Some(b'#') + ); + if boundary_ok { + rewritten = format!("{}{}", self.request_host, suffix); + } + } + + (rewritten != value).then_some(rewritten) + } } let patterns = Rc::new(UrlPatterns { @@ -102,11 +205,8 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso move |el| { if let Some(mut href) = el.get_attribute("href") { let original_href = href.clone(); - let new_href = href - .replace(&patterns.https_origin(), &patterns.replacement_url()) - .replace(&patterns.http_origin(), &patterns.replacement_url()); - if new_href != href { - href = new_href; + if let Some(rewritten) = patterns.rewrite_url_value(&href) { + href = rewritten; } match integrations.rewrite_attribute( @@ -143,11 +243,8 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso move |el| { if let Some(mut src) = el.get_attribute("src") { let original_src = src.clone(); - let new_src = src - .replace(&patterns.https_origin(), &patterns.replacement_url()) - .replace(&patterns.http_origin(), &patterns.replacement_url()); - if new_src != src { - src = new_src; + if let Some(rewritten) = patterns.rewrite_url_value(&src) { + src = rewritten; } match integrations.rewrite_attribute( "src", @@ -183,11 +280,8 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso move |el| { if let Some(mut action) = el.get_attribute("action") { let original_action = action.clone(); - let new_action = action - .replace(&patterns.https_origin(), &patterns.replacement_url()) - .replace(&patterns.http_origin(), &patterns.replacement_url()); - if new_action != action { - action = new_action; + if let Some(rewritten) = patterns.rewrite_url_value(&action) { + action = 
rewritten; } match integrations.rewrite_attribute( @@ -314,15 +408,19 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso let selector = script_rewriter.selector(); let rewriter = script_rewriter.clone(); let patterns = patterns.clone(); + let document_state = document_state.clone(); element_content_handlers.push(text!(selector, { let rewriter = rewriter.clone(); let patterns = patterns.clone(); + let document_state = document_state.clone(); move |text| { let ctx = IntegrationScriptContext { selector, request_host: &patterns.request_host, request_scheme: &patterns.request_scheme, origin_host: &patterns.origin_host, + is_last_in_text_node: text.last_in_text_node(), + document_state: &document_state, }; match rewriter.rewrite(text.as_str(), &ctx) { ScriptRewriteAction::Keep => {} @@ -343,7 +441,14 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso ..RewriterSettings::default() }; - HtmlRewriterAdapter::new(rewriter_settings) + HtmlWithPostProcessing { + inner: HtmlRewriterAdapter::new(rewriter_settings), + post_processors, + origin_host: config.origin_host, + request_host: config.request_host, + request_scheme: config.request_scheme, + document_state, + } } #[cfg(test)] @@ -436,8 +541,12 @@ mod tests { let html = r#" Link + Proto + Bare +
+ "#; let mut output = Vec::new(); @@ -447,8 +556,12 @@ mod tests { let result = String::from_utf8(output).unwrap(); assert!(result.contains(r#"href="https://test.example.com/page""#)); + assert!(result.contains(r#"href="//test.example.com/proto""#)); + assert!(result.contains(r#"href="test.example.com/bare""#)); assert!(result.contains(r#"src="https://test.example.com/image.jpg""#)); + assert!(result.contains(r#"src="//test.example.com/image2.jpg""#)); assert!(result.contains(r#"action="https://test.example.com/submit""#)); + assert!(result.contains(r#"action="//test.example.com/submit2""#)); assert!(!result.contains("origin.example.com")); } diff --git a/crates/common/src/integrations/mod.rs b/crates/common/src/integrations/mod.rs index 888fa5a..5cc8fc2 100644 --- a/crates/common/src/integrations/mod.rs +++ b/crates/common/src/integrations/mod.rs @@ -10,7 +10,8 @@ pub mod testlight; pub use registry::{ AttributeRewriteAction, AttributeRewriteOutcome, IntegrationAttributeContext, - IntegrationAttributeRewriter, IntegrationEndpoint, IntegrationMetadata, IntegrationProxy, + IntegrationAttributeRewriter, IntegrationDocumentState, IntegrationEndpoint, + IntegrationHtmlContext, IntegrationHtmlPostProcessor, IntegrationMetadata, IntegrationProxy, IntegrationRegistration, IntegrationRegistrationBuilder, IntegrationRegistry, IntegrationScriptContext, IntegrationScriptRewriter, ScriptRewriteAction, }; diff --git a/crates/common/src/integrations/nextjs.rs b/crates/common/src/integrations/nextjs.rs deleted file mode 100644 index ad33302..0000000 --- a/crates/common/src/integrations/nextjs.rs +++ /dev/null @@ -1,388 +0,0 @@ -use std::sync::Arc; - -use regex::{escape, Regex}; -use serde::{Deserialize, Serialize}; -use validator::Validate; - -use crate::integrations::{ - IntegrationRegistration, IntegrationScriptContext, IntegrationScriptRewriter, - ScriptRewriteAction, -}; -use crate::settings::{IntegrationConfig, Settings}; - -const NEXTJS_INTEGRATION_ID: &str = "nextjs"; - 
-#[derive(Debug, Clone, Deserialize, Serialize, Validate)] -pub struct NextJsIntegrationConfig { - #[serde(default = "default_enabled")] - pub enabled: bool, - #[serde( - default = "default_rewrite_attributes", - deserialize_with = "crate::settings::vec_from_seq_or_map" - )] - #[validate(length(min = 1))] - pub rewrite_attributes: Vec, -} - -impl IntegrationConfig for NextJsIntegrationConfig { - fn is_enabled(&self) -> bool { - self.enabled - } -} - -fn default_enabled() -> bool { - false -} - -fn default_rewrite_attributes() -> Vec { - vec!["href".to_string(), "link".to_string(), "url".to_string()] -} - -pub fn register(settings: &Settings) -> Option { - let config = build(settings)?; - let structured = Arc::new(NextJsScriptRewriter::new( - Arc::clone(&config), - NextJsRewriteMode::Structured, - )); - let streamed = Arc::new(NextJsScriptRewriter::new( - config, - NextJsRewriteMode::Streamed, - )); - - Some( - IntegrationRegistration::builder(NEXTJS_INTEGRATION_ID) - .with_script_rewriter(structured) - .with_script_rewriter(streamed) - .build(), - ) -} - -fn build(settings: &Settings) -> Option> { - let config = settings - .integration_config::(NEXTJS_INTEGRATION_ID) - .ok() - .flatten()?; - Some(Arc::new(config)) -} - -#[derive(Clone, Copy)] -enum NextJsRewriteMode { - Structured, - Streamed, -} - -struct NextJsScriptRewriter { - config: Arc, - mode: NextJsRewriteMode, -} - -impl NextJsScriptRewriter { - fn new(config: Arc, mode: NextJsRewriteMode) -> Self { - Self { config, mode } - } - - fn rewrite_values( - &self, - content: &str, - ctx: &IntegrationScriptContext<'_>, - ) -> ScriptRewriteAction { - if let Some(rewritten) = rewrite_nextjs_values( - content, - ctx.origin_host, - ctx.request_host, - ctx.request_scheme, - &self.config.rewrite_attributes, - ) { - ScriptRewriteAction::replace(rewritten) - } else { - ScriptRewriteAction::keep() - } - } -} - -impl IntegrationScriptRewriter for NextJsScriptRewriter { - fn integration_id(&self) -> &'static str { - 
NEXTJS_INTEGRATION_ID - } - - fn selector(&self) -> &'static str { - match self.mode { - NextJsRewriteMode::Structured => "script#__NEXT_DATA__", - NextJsRewriteMode::Streamed => "script", - } - } - - fn rewrite(&self, content: &str, ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction { - if self.config.rewrite_attributes.is_empty() { - return ScriptRewriteAction::keep(); - } - - match self.mode { - NextJsRewriteMode::Structured => self.rewrite_values(content, ctx), - NextJsRewriteMode::Streamed => { - if !content.contains("self.__next_f") { - return ScriptRewriteAction::keep(); - } - self.rewrite_values(content, ctx) - } - } - } -} - -fn rewrite_nextjs_values( - content: &str, - origin_host: &str, - request_host: &str, - request_scheme: &str, - attributes: &[String], -) -> Option { - if origin_host.is_empty() || request_host.is_empty() || attributes.is_empty() { - return None; - } - - let mut rewritten = content.to_string(); - let mut changed = false; - let escaped_origin = escape(origin_host); - let replacement_scheme = format!("{}://{}", request_scheme, request_host); - - for attribute in attributes { - let escaped_attr = escape(attribute); - let pattern = format!( - r#"(?P(?:\\*")?{attr}(?:\\*")?:\\*")(?Phttps?://|//){origin}"#, - attr = escaped_attr, - origin = escaped_origin, - ); - let regex = Regex::new(&pattern).expect("valid Next.js rewrite regex"); - let next_value = regex.replace_all(&rewritten, |caps: ®ex::Captures<'_>| { - let scheme = &caps["scheme"]; - let replacement = if scheme == "//" { - format!("//{}", request_host) - } else { - replacement_scheme.clone() - }; - format!("{}{}", &caps["prefix"], replacement) - }); - if next_value != rewritten { - changed = true; - rewritten = next_value.into_owned(); - } - } - - changed.then_some(rewritten) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::html_processor::{create_html_processor, HtmlProcessorConfig}; - use crate::integrations::{IntegrationRegistry, IntegrationScriptContext, 
ScriptRewriteAction}; - use crate::streaming_processor::{Compression, PipelineConfig, StreamingPipeline}; - use crate::test_support::tests::create_test_settings; - use serde_json::json; - use std::io::Cursor; - - fn test_config() -> Arc { - Arc::new(NextJsIntegrationConfig { - enabled: true, - rewrite_attributes: vec!["href".into(), "link".into(), "url".into()], - }) - } - - fn ctx(selector: &'static str) -> IntegrationScriptContext<'static> { - IntegrationScriptContext { - selector, - request_host: "ts.example.com", - request_scheme: "https", - origin_host: "origin.example.com", - } - } - - #[test] - fn structured_rewriter_updates_next_data_payload() { - let payload = r#"{"props":{"pageProps":{"primary":{"href":"https://origin.example.com/reviews"},"secondary":{"href":"http://origin.example.com/sign-in"},"fallbackHref":"http://origin.example.com/legacy","protoRelative":"//origin.example.com/assets/logo.png"}}}"#; - let rewriter = NextJsScriptRewriter::new(test_config(), NextJsRewriteMode::Structured); - let result = rewriter.rewrite(payload, &ctx("script#__NEXT_DATA__")); - - match result { - ScriptRewriteAction::Replace(value) => { - assert!(value.contains(r#""href":"https://ts.example.com/reviews""#)); - assert!(value.contains(r#""href":"https://ts.example.com/sign-in""#)); - assert!(value.contains(r#""fallbackHref":"http://origin.example.com/legacy""#)); - assert!(value.contains(r#""protoRelative":"//origin.example.com/assets/logo.png""#)); - } - _ => panic!("Expected rewrite to update payload"), - } - } - - #[test] - fn streamed_rewriter_only_runs_for_next_payloads() { - let rewriter = NextJsScriptRewriter::new(test_config(), NextJsRewriteMode::Streamed); - - let noop = rewriter.rewrite("console.log('hello');", &ctx("script")); - assert!(matches!(noop, ScriptRewriteAction::Keep)); - - let payload = r#"self.__next_f.push(["chunk", "{\"href\":\"https://origin.example.com/app\"}"]); - "#; - let rewritten = rewriter.rewrite(payload, &ctx("script")); - match 
rewritten { - ScriptRewriteAction::Replace(value) => { - assert!(value.contains(r#"https://ts.example.com/app"#)); - } - _ => panic!("Expected streamed payload rewrite"), - } - } - - #[test] - fn rewrite_helper_handles_protocol_relative_urls() { - let content = r#"{"props":{"pageProps":{"link":"//origin.example.com/image.png"}}}"#; - let rewritten = rewrite_nextjs_values( - content, - "origin.example.com", - "ts.example.com", - "https", - &["link".into()], - ) - .expect("should rewrite protocol relative link"); - - assert!(rewritten.contains(r#""link":"//ts.example.com/image.png""#)); - } - - fn config_from_settings( - settings: &Settings, - registry: &IntegrationRegistry, - ) -> HtmlProcessorConfig { - HtmlProcessorConfig::from_settings( - settings, - registry, - "origin.example.com", - "test.example.com", - "https", - ) - } - - #[test] - fn html_processor_rewrites_nextjs_script_when_enabled() { - let html = r#" - - "#; - - let mut settings = create_test_settings(); - settings - .integrations - .insert_config( - "nextjs", - &json!({ - "enabled": true, - "rewrite_attributes": ["href", "link", "url"], - }), - ) - .expect("should update nextjs config"); - let registry = IntegrationRegistry::new(&settings); - let config = config_from_settings(&settings, ®istry); - let processor = create_html_processor(config); - let pipeline_config = PipelineConfig { - input_compression: Compression::None, - output_compression: Compression::None, - chunk_size: 8192, - }; - let mut pipeline = StreamingPipeline::new(pipeline_config, processor); - - let mut output = Vec::new(); - pipeline - .process(Cursor::new(html.as_bytes()), &mut output) - .unwrap(); - let processed = String::from_utf8_lossy(&output); - - assert!( - processed.contains(r#""href":"https://test.example.com/reviews""#), - "should rewrite https Next.js href values" - ); - assert!( - processed.contains(r#""href":"https://test.example.com/sign-in""#), - "should rewrite http Next.js href values" - ); - assert!( - 
processed.contains(r#""fallbackHref":"http://origin.example.com/legacy""#), - "should leave other fields untouched" - ); - assert!( - processed.contains(r#""protoRelative":"//origin.example.com/assets/logo.png""#), - "should not rewrite non-href keys" - ); - assert!( - !processed.contains("\"href\":\"https://origin.example.com/reviews\""), - "should remove origin https href" - ); - assert!( - !processed.contains("\"href\":\"http://origin.example.com/sign-in\""), - "should remove origin http href" - ); - } - - #[test] - fn html_processor_rewrites_nextjs_stream_payload() { - let html = r#" - - "#; - - let mut settings = create_test_settings(); - settings - .integrations - .insert_config( - "nextjs", - &json!({ - "enabled": true, - "rewrite_attributes": ["href", "link", "url"], - }), - ) - .expect("should update nextjs config"); - let registry = IntegrationRegistry::new(&settings); - let config = config_from_settings(&settings, ®istry); - let processor = create_html_processor(config); - let pipeline_config = PipelineConfig { - input_compression: Compression::None, - output_compression: Compression::None, - chunk_size: 8192, - }; - let mut pipeline = StreamingPipeline::new(pipeline_config, processor); - - let mut output = Vec::new(); - pipeline - .process(Cursor::new(html.as_bytes()), &mut output) - .unwrap(); - let processed = String::from_utf8_lossy(&output); - let normalized = processed.replace('\\', ""); - assert!( - normalized.contains("\"href\":\"https://test.example.com/dashboard\""), - "should rewrite escaped href sequences inside streamed payloads: {}", - normalized - ); - assert!( - normalized.contains("\"href\":\"https://test.example.com/secondary\""), - "should rewrite plain href attributes inside streamed payloads" - ); - assert!( - normalized.contains("\"link\":\"https://test.example.com/api-test\""), - "should rewrite additional configured attributes like link" - ); - assert!( - processed.contains("\"dataHost\":\"https://origin.example.com/api\""), - 
"should leave non-href properties untouched" - ); - } - - #[test] - fn register_respects_enabled_flag() { - let settings = create_test_settings(); - let registration = register(&settings); - - assert!( - registration.is_none(), - "should skip registration when integration is disabled" - ); - } -} diff --git a/crates/common/src/integrations/nextjs/fixtures/inlined-data-escaped.html b/crates/common/src/integrations/nextjs/fixtures/inlined-data-escaped.html new file mode 100644 index 0000000..81c213c --- /dev/null +++ b/crates/common/src/integrations/nextjs/fixtures/inlined-data-escaped.html @@ -0,0 +1,7 @@ + + + + + + + diff --git a/crates/common/src/integrations/nextjs/fixtures/inlined-data-nonce.html b/crates/common/src/integrations/nextjs/fixtures/inlined-data-nonce.html new file mode 100644 index 0000000..5ae7afa --- /dev/null +++ b/crates/common/src/integrations/nextjs/fixtures/inlined-data-nonce.html @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/crates/common/src/integrations/nextjs/html_post_process.rs b/crates/common/src/integrations/nextjs/html_post_process.rs new file mode 100644 index 0000000..f379872 --- /dev/null +++ b/crates/common/src/integrations/nextjs/html_post_process.rs @@ -0,0 +1,906 @@ +use std::cell::{Cell, RefCell}; +use std::rc::Rc; +use std::sync::Arc; +use std::sync::Mutex; + +use lol_html::{text, Settings as RewriterSettings}; + +use crate::integrations::{IntegrationHtmlContext, IntegrationHtmlPostProcessor}; + +use super::rsc::rewrite_rsc_scripts_combined_with_limit; +use super::rsc_placeholders::{ + NextJsRscPostProcessState, RSC_PAYLOAD_PLACEHOLDER_PREFIX, RSC_PAYLOAD_PLACEHOLDER_SUFFIX, +}; +use super::shared::find_rsc_push_payload_range; +use super::{NextJsIntegrationConfig, NEXTJS_INTEGRATION_ID}; + +pub(crate) struct NextJsHtmlPostProcessor { + config: Arc, +} + +impl NextJsHtmlPostProcessor { + pub(crate) fn new(config: Arc) -> Self { + Self { config } + } +} + +impl IntegrationHtmlPostProcessor for NextJsHtmlPostProcessor { + fn 
integration_id(&self) -> &'static str { + NEXTJS_INTEGRATION_ID + } + + fn should_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> bool { + if !self.config.enabled || self.config.rewrite_attributes.is_empty() { + return false; + } + + // Check if we have captured placeholders from streaming + if let Some(state) = ctx + .document_state + .get::>(NEXTJS_INTEGRATION_ID) + { + let guard = state.lock().unwrap_or_else(|e| e.into_inner()); + if !guard.payloads.is_empty() { + return true; + } + } + + // Also check if HTML contains RSC scripts that weren't captured during streaming + // (e.g., fragmented scripts that we skipped during the streaming pass) + html.contains("__next_f.push") && html.contains(ctx.origin_host) + } + + fn post_process(&self, html: &mut String, ctx: &IntegrationHtmlContext<'_>) -> bool { + // Try to get payloads captured during streaming (placeholder approach) + let payloads = ctx + .document_state + .get::>(NEXTJS_INTEGRATION_ID) + .map(|state| { + let mut guard = state.lock().unwrap_or_else(|e| e.into_inner()); + guard.take_payloads() + }) + .unwrap_or_default(); + + if !payloads.is_empty() { + // Placeholder approach: substitute placeholders with rewritten payloads + return self.substitute_placeholders(html, ctx, payloads); + } + + // Fallback: re-parse HTML to find RSC scripts that weren't captured during streaming + // (e.g., fragmented scripts that we skipped during the streaming pass) + post_process_rsc_html_in_place_with_limit( + html, + ctx.origin_host, + ctx.request_host, + ctx.request_scheme, + self.config.max_combined_payload_bytes, + ) + } +} + +impl NextJsHtmlPostProcessor { + /// Substitute placeholders with rewritten payloads (fast path for unfragmented scripts). 
+ fn substitute_placeholders( + &self, + html: &mut String, + ctx: &IntegrationHtmlContext<'_>, + payloads: Vec, + ) -> bool { + let payload_refs: Vec<&str> = payloads.iter().map(String::as_str).collect(); + let mut rewritten_payloads = rewrite_rsc_scripts_combined_with_limit( + payload_refs.as_slice(), + ctx.origin_host, + ctx.request_host, + ctx.request_scheme, + self.config.max_combined_payload_bytes, + ); + + if rewritten_payloads.len() != payloads.len() { + log::warn!( + "NextJs post-process skipping due to rewrite payload count mismatch: original={}, rewritten={}", + payloads.len(), + rewritten_payloads.len() + ); + rewritten_payloads = payloads; + } + + if log::log_enabled!(log::Level::Debug) { + let origin_count_before: usize = rewritten_payloads + .iter() + .map(|p| p.matches(ctx.origin_host).count()) + .sum(); + log::debug!( + "NextJs post-processor substituting RSC payloads: scripts={}, origin_urls={}, origin={}, proxy={}://{}, html_len={}", + rewritten_payloads.len(), + origin_count_before, + ctx.origin_host, + ctx.request_scheme, + ctx.request_host, + html.len() + ); + } + + let (updated, replaced) = + substitute_rsc_payload_placeholders(html.as_str(), &rewritten_payloads); + + let expected = rewritten_payloads.len(); + if replaced != expected { + log::warn!( + "NextJs post-process placeholder substitution count mismatch: expected={}, replaced={}", + expected, + replaced + ); + } + + if contains_rsc_payload_placeholders(&updated) { + log::error!( + "NextJs post-process left RSC placeholders in output; attempting fallback substitution (scripts={})", + expected + ); + + let fallback = + substitute_rsc_payload_placeholders_exact(html.as_str(), &rewritten_payloads); + + if contains_rsc_payload_placeholders(&fallback) { + log::error!( + "NextJs post-process fallback substitution still left RSC placeholders in output; hydration may break (scripts={})", + expected + ); + } + + *html = fallback; + return true; + } + + *html = updated; + true + } +} + +fn 
contains_rsc_payload_placeholders(html: &str) -> bool { + let mut cursor = 0usize; + while let Some(next) = html[cursor..].find(RSC_PAYLOAD_PLACEHOLDER_PREFIX) { + let start = cursor + next; + let after_prefix = start + RSC_PAYLOAD_PLACEHOLDER_PREFIX.len(); + let mut idx_end = after_prefix; + while idx_end < html.len() && html.as_bytes()[idx_end].is_ascii_digit() { + idx_end += 1; + } + if idx_end > after_prefix && html[idx_end..].starts_with(RSC_PAYLOAD_PLACEHOLDER_SUFFIX) { + return true; + } + cursor = after_prefix; + } + false +} + +fn substitute_rsc_payload_placeholders(html: &str, replacements: &[String]) -> (String, usize) { + let mut output = String::with_capacity(html.len()); + let mut cursor = 0usize; + let mut replaced = 0usize; + + while let Some(next) = html[cursor..].find(RSC_PAYLOAD_PLACEHOLDER_PREFIX) { + let start = cursor + next; + output.push_str(&html[cursor..start]); + + let after_prefix = start + RSC_PAYLOAD_PLACEHOLDER_PREFIX.len(); + let mut idx_end = after_prefix; + while idx_end < html.len() && html.as_bytes()[idx_end].is_ascii_digit() { + idx_end += 1; + } + + let suffix_ok = + idx_end > after_prefix && html[idx_end..].starts_with(RSC_PAYLOAD_PLACEHOLDER_SUFFIX); + if !suffix_ok { + output.push_str(RSC_PAYLOAD_PLACEHOLDER_PREFIX); + cursor = after_prefix; + continue; + } + + let idx_str = &html[after_prefix..idx_end]; + let Ok(index) = idx_str.parse::() else { + output.push_str(RSC_PAYLOAD_PLACEHOLDER_PREFIX); + output.push_str(idx_str); + output.push_str(RSC_PAYLOAD_PLACEHOLDER_SUFFIX); + cursor = idx_end + RSC_PAYLOAD_PLACEHOLDER_SUFFIX.len(); + continue; + }; + + let Some(replacement) = replacements.get(index) else { + output.push_str(RSC_PAYLOAD_PLACEHOLDER_PREFIX); + output.push_str(idx_str); + output.push_str(RSC_PAYLOAD_PLACEHOLDER_SUFFIX); + cursor = idx_end + RSC_PAYLOAD_PLACEHOLDER_SUFFIX.len(); + continue; + }; + + output.push_str(replacement); + replaced += 1; + cursor = idx_end + RSC_PAYLOAD_PLACEHOLDER_SUFFIX.len(); + } + + 
output.push_str(&html[cursor..]); + (output, replaced) +} + +fn substitute_rsc_payload_placeholders_exact(html: &str, replacements: &[String]) -> String { + let mut out = html.to_string(); + for (index, replacement) in replacements.iter().enumerate() { + let placeholder = + format!("{RSC_PAYLOAD_PLACEHOLDER_PREFIX}{index}{RSC_PAYLOAD_PLACEHOLDER_SUFFIX}"); + out = out.replace(&placeholder, replacement); + } + out +} + +#[derive(Debug, Clone, Copy)] +struct RscPushScriptRange { + payload_start: usize, + payload_end: usize, +} + +fn find_rsc_push_scripts(html: &str) -> Vec { + if !html.contains("__next_f") { + return Vec::new(); + } + + let ranges: Rc>> = Rc::new(RefCell::new(Vec::new())); + let buffer: Rc> = Rc::new(RefCell::new(String::new())); + let buffering = Rc::new(Cell::new(false)); + let buffer_start = Rc::new(Cell::new(0usize)); + + let settings = RewriterSettings { + element_content_handlers: vec![text!("script", { + let ranges = Rc::clone(&ranges); + let buffer = Rc::clone(&buffer); + let buffering = Rc::clone(&buffering); + let buffer_start = Rc::clone(&buffer_start); + move |t| { + if !buffering.get() && t.last_in_text_node() { + let script = t.as_str(); + if !script.contains("__next_f") { + return Ok(()); + } + + let Some((payload_start_rel, payload_end_rel)) = + find_rsc_push_payload_range(script) + else { + return Ok(()); + }; + + let loc = t.source_location().bytes(); + ranges.borrow_mut().push(RscPushScriptRange { + payload_start: loc.start + payload_start_rel, + payload_end: loc.start + payload_end_rel, + }); + return Ok(()); + } + + if !buffering.get() { + buffering.set(true); + buffer_start.set(t.source_location().bytes().start); + } + buffer.borrow_mut().push_str(t.as_str()); + + if !t.last_in_text_node() { + return Ok(()); + } + + buffering.set(false); + let script = std::mem::take(&mut *buffer.borrow_mut()); + if !script.contains("__next_f") { + return Ok(()); + } + + let Some((payload_start_rel, payload_end_rel)) = + 
find_rsc_push_payload_range(&script) + else { + return Ok(()); + }; + + let base = buffer_start.get(); + ranges.borrow_mut().push(RscPushScriptRange { + payload_start: base + payload_start_rel, + payload_end: base + payload_end_rel, + }); + + Ok(()) + } + })], + ..RewriterSettings::default() + }; + + let mut rewriter = lol_html::HtmlRewriter::new(settings, |_chunk: &[u8]| {}); + if rewriter.write(html.as_bytes()).is_err() || rewriter.end().is_err() { + return Vec::new(); + } + + let result = std::mem::take(&mut *ranges.borrow_mut()); + result +} + +/// Rewrite RSC payload URLs in HTML by re-parsing the document. +/// +/// # Deprecation +/// +/// This function is **deprecated** in favor of the placeholder-based approach used in production: +/// - `NextJsRscPlaceholderRewriter` captures payloads during the initial `lol_html` pass +/// - `NextJsHtmlPostProcessor` rewrites and substitutes them at end-of-document +/// +/// This function re-parses HTML with `lol_html`, which is slower than the placeholder approach. +/// It remains available for testing and backward compatibility. +#[deprecated( + since = "0.1.0", + note = "Use NextJsHtmlPostProcessor for production RSC rewriting. This function re-parses HTML." +)] +pub fn post_process_rsc_html( + html: &str, + origin_host: &str, + request_host: &str, + request_scheme: &str, +) -> String { + let mut result = html.to_string(); + #[allow(deprecated)] + post_process_rsc_html_in_place(&mut result, origin_host, request_host, request_scheme); + result +} + +/// Rewrite RSC payload URLs in HTML in place by re-parsing the document. +/// +/// # Deprecation +/// +/// This function is **deprecated** in favor of the placeholder-based approach used in production. +/// See [`post_process_rsc_html`] for details. +#[deprecated( + since = "0.1.0", + note = "Use NextJsHtmlPostProcessor for production RSC rewriting. This function re-parses HTML." 
+)] +pub fn post_process_rsc_html_in_place( + html: &mut String, + origin_host: &str, + request_host: &str, + request_scheme: &str, +) -> bool { + post_process_rsc_html_in_place_with_limit( + html, + origin_host, + request_host, + request_scheme, + super::rsc::DEFAULT_MAX_COMBINED_PAYLOAD_BYTES, + ) +} + +fn post_process_rsc_html_in_place_with_limit( + html: &mut String, + origin_host: &str, + request_host: &str, + request_scheme: &str, + max_combined_payload_bytes: usize, +) -> bool { + let mut scripts = find_rsc_push_scripts(html.as_str()); + if scripts.is_empty() { + return false; + } + + scripts.sort_by_key(|s| s.payload_start); + let mut previous_end = 0usize; + for script in &scripts { + if script.payload_start > script.payload_end { + log::warn!( + "NextJs post-process skipping due to invalid payload range: start={}, end={}", + script.payload_start, + script.payload_end + ); + return false; + } + if script.payload_end > html.len() + || !html.is_char_boundary(script.payload_start) + || !html.is_char_boundary(script.payload_end) + { + log::warn!( + "NextJs post-process skipping due to non-UTF8 boundary payload range: start={}, end={}, html_len={}", + script.payload_start, + script.payload_end, + html.len() + ); + return false; + } + if script.payload_start < previous_end { + log::warn!( + "NextJs post-process skipping due to overlapping payload ranges: prev_end={}, start={}, end={}", + previous_end, + script.payload_start, + script.payload_end + ); + return false; + } + previous_end = script.payload_end; + } + + let rewritten_payloads = { + let Some(payloads) = scripts + .iter() + .map(|s| html.get(s.payload_start..s.payload_end)) + .collect::>>() + else { + log::warn!( + "NextJs post-process skipping due to invalid UTF-8 payload slicing despite boundary checks" + ); + return false; + }; + + if !payloads.iter().any(|p| p.contains(origin_host)) { + return false; + } + + if log::log_enabled!(log::Level::Debug) { + let origin_count_before: usize = payloads + 
.iter() + .map(|p| p.matches(origin_host).count()) + .sum(); + log::debug!( + "post_process_rsc_html: {} scripts, {} origin URLs, origin={}, proxy={}://{}", + payloads.len(), + origin_count_before, + origin_host, + request_scheme, + request_host + ); + } + + let rewritten_payloads = rewrite_rsc_scripts_combined_with_limit( + payloads.as_slice(), + origin_host, + request_host, + request_scheme, + max_combined_payload_bytes, + ); + + if rewritten_payloads.len() != payloads.len() { + log::warn!( + "NextJs post-process skipping due to rewrite payload count mismatch: original={}, rewritten={}", + payloads.len(), + rewritten_payloads.len() + ); + return false; + } + + let changed = payloads + .iter() + .zip(&rewritten_payloads) + .any(|(original, rewritten)| *original != rewritten); + if !changed { + return false; + } + + rewritten_payloads + }; + + for (i, script) in scripts.iter().enumerate().rev() { + html.replace_range( + script.payload_start..script.payload_end, + &rewritten_payloads[i], + ); + } + + true +} + +#[cfg(test)] +#[allow(deprecated)] // Tests use deprecated post_process_rsc_html for legacy API coverage +mod tests { + use super::*; + + fn find_rsc_push_scripts_chunked( + html: &str, + chunk_size: usize, + ) -> (Vec, bool) { + if !html.contains("__next_f") { + return (Vec::new(), false); + } + + let ranges: Rc>> = Rc::new(RefCell::new(Vec::new())); + let buffer: Rc> = Rc::new(RefCell::new(String::new())); + let buffering = Rc::new(Cell::new(false)); + let buffer_start = Rc::new(Cell::new(0usize)); + let saw_partial = Rc::new(Cell::new(false)); + + let settings = RewriterSettings { + element_content_handlers: vec![text!("script", { + let ranges = Rc::clone(&ranges); + let buffer = Rc::clone(&buffer); + let buffering = Rc::clone(&buffering); + let buffer_start = Rc::clone(&buffer_start); + let saw_partial = Rc::clone(&saw_partial); + move |t| { + if !t.last_in_text_node() { + saw_partial.set(true); + } + + if !buffering.get() && t.last_in_text_node() { + let 
script = t.as_str(); + if !script.contains("__next_f") { + return Ok(()); + } + + let Some((payload_start_rel, payload_end_rel)) = + find_rsc_push_payload_range(script) + else { + return Ok(()); + }; + + let loc = t.source_location().bytes(); + ranges.borrow_mut().push(RscPushScriptRange { + payload_start: loc.start + payload_start_rel, + payload_end: loc.start + payload_end_rel, + }); + return Ok(()); + } + + if !buffering.get() { + buffering.set(true); + buffer_start.set(t.source_location().bytes().start); + } + buffer.borrow_mut().push_str(t.as_str()); + + if !t.last_in_text_node() { + return Ok(()); + } + + buffering.set(false); + let script = std::mem::take(&mut *buffer.borrow_mut()); + if !script.contains("__next_f") { + return Ok(()); + } + + let Some((payload_start_rel, payload_end_rel)) = + find_rsc_push_payload_range(&script) + else { + return Ok(()); + }; + + let base = buffer_start.get(); + ranges.borrow_mut().push(RscPushScriptRange { + payload_start: base + payload_start_rel, + payload_end: base + payload_end_rel, + }); + + Ok(()) + } + })], + ..RewriterSettings::default() + }; + + let mut rewriter = lol_html::HtmlRewriter::new(settings, |_chunk: &[u8]| {}); + let chunk_size = chunk_size.max(1); + for chunk in html.as_bytes().chunks(chunk_size) { + if rewriter.write(chunk).is_err() { + return (Vec::new(), saw_partial.get()); + } + } + if rewriter.end().is_err() { + return (Vec::new(), saw_partial.get()); + } + + let result = std::mem::take(&mut *ranges.borrow_mut()); + (result, saw_partial.get()) + } + + #[test] + fn post_process_rsc_html_rewrites_cross_script_tchunks() { + let html = r#" + + +"#; + + let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); + + assert!( + result.contains("test.example.com/page"), + "URL should be rewritten. Got: {}", + result + ); + assert!( + result.contains(":T3c,"), + "T-chunk length should be updated. 
Got: {}", + result + ); + assert!(result.contains("") && result.contains("")); + assert!(result.contains("self.__next_f.push")); + } + + #[test] + fn finds_rsc_push_scripts_with_fragmented_script_text_chunks() { + let filler = "a".repeat(32 * 1024); + let payload = format!("{filler} https://origin.example.com/page"); + let html = format!( + r#""# + ); + + let (scripts, saw_partial) = find_rsc_push_scripts_chunked(&html, 64); + + assert!( + saw_partial, + "should observe fragmented script text chunks when writing input in small pieces" + ); + assert_eq!( + scripts.len(), + 1, + "Should find exactly one RSC payload script" + ); + + let extracted = &html[scripts[0].payload_start..scripts[0].payload_end]; + assert_eq!( + extracted.len(), + payload.len(), + "Extracted payload length should match the original payload" + ); + assert!( + extracted.ends_with("https://origin.example.com/page"), + "Extracted payload should contain the origin URL" + ); + } + + #[test] + fn finds_assignment_push_form() { + let html = r#""#; + let scripts = find_rsc_push_scripts(html); + assert_eq!( + scripts.len(), + 1, + "Should find exactly one RSC payload script" + ); + let payload = &html[scripts[0].payload_start..scripts[0].payload_end]; + assert_eq!(payload, "payload", "Should capture the payload string"); + } + + #[test] + fn finds_window_next_f_push_with_case_insensitive_script_tags() { + let html = r#""#; + let scripts = find_rsc_push_scripts(html); + assert_eq!( + scripts.len(), + 1, + "Should find exactly one RSC payload script" + ); + let payload = &html[scripts[0].payload_start..scripts[0].payload_end]; + assert_eq!(payload, "payload", "Should capture the payload string"); + } + + #[test] + fn post_process_rsc_html_handles_prettified_format() { + let html = r#" + + +"#; + + let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); + + assert!( + result.contains("test.example.com/news"), + "First URL should be rewritten. 
Got: {}", + result + ); + assert!( + result.contains("test.example.com/reviews"), + "Second URL should be rewritten. Got: {}", + result + ); + assert!( + !result.contains("origin.example.com"), + "No origin URLs should remain. Got: {}", + result + ); + assert!(result.contains("") && result.contains("")); + assert!(result.contains("self.__next_f.push")); + } + + #[test] + fn post_process_rewrites_html_href_inside_tchunk() { + fn calculate_unescaped_byte_length_for_test(s: &str) -> usize { + let bytes = s.as_bytes(); + let mut pos = 0usize; + let mut count = 0usize; + + while pos < bytes.len() { + if bytes[pos] == b'\\' && pos + 1 < bytes.len() { + let esc = bytes[pos + 1]; + + if matches!( + esc, + b'n' | b'r' | b't' | b'b' | b'f' | b'v' | b'"' | b'\'' | b'\\' | b'/' + ) { + pos += 2; + count += 1; + continue; + } + + if esc == b'x' && pos + 3 < bytes.len() { + pos += 4; + count += 1; + continue; + } + + if esc == b'u' && pos + 5 < bytes.len() { + let hex = &s[pos + 2..pos + 6]; + if hex.chars().all(|c| c.is_ascii_hexdigit()) { + if let Ok(code_unit) = u16::from_str_radix(hex, 16) { + // Surrogate pairs use UTF-16 and expand to 4 bytes in UTF-8. 
+ if (0xD800..=0xDBFF).contains(&code_unit) + && pos + 11 < bytes.len() + && bytes[pos + 6] == b'\\' + && bytes[pos + 7] == b'u' + { + let hex2 = &s[pos + 8..pos + 12]; + if hex2.chars().all(|c| c.is_ascii_hexdigit()) { + if let Ok(code_unit2) = u16::from_str_radix(hex2, 16) { + if (0xDC00..=0xDFFF).contains(&code_unit2) { + pos += 12; + count += 4; + continue; + } + } + } + } + + let c = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); + pos += 6; + count += c.len_utf8(); + continue; + } + } + } + } + + if bytes[pos] < 0x80 { + pos += 1; + count += 1; + } else { + let c = s[pos..].chars().next().unwrap_or('\u{FFFD}'); + pos += c.len_utf8(); + count += c.len_utf8(); + } + } + + count + } + + let tchunk_content = r#"\u003cdiv\u003e\u003ca href="https://origin.example.com/about-us"\u003eAbout\u003c/a\u003e\u003c/div\u003e"#; + let declared_len_hex = format!( + "{:x}", + calculate_unescaped_byte_length_for_test(tchunk_content) + ); + let html = format!( + r#" + +"# + ); + + let result = + post_process_rsc_html(&html, "origin.example.com", "test.example.com", "https"); + + assert!( + result.contains("test.example.com/about-us"), + "HTML href URL in T-chunk should be rewritten. Got: {}", + result + ); + assert!( + !result.contains("origin.example.com"), + "No origin URLs should remain. Got: {}", + result + ); + assert!( + !result.contains(&format!(":T{declared_len_hex},")), + "T-chunk length should have been recalculated. 
Got: {}", + result + ); + } + + #[test] + fn handles_nextjs_inlined_data_nonce_fixture() { + // Fixture mirrors Next.js `createInlinedDataReadableStream` output: + // `` + let html = include_str!("fixtures/inlined-data-nonce.html"); + let scripts = find_rsc_push_scripts(html); + assert_eq!(scripts.len(), 1, "Should find exactly one RSC data script"); + + let rewritten = + post_process_rsc_html(html, "origin.example.com", "proxy.example.com", "https"); + assert!( + rewritten.contains("https://proxy.example.com/news"), + "Fixture URL should be rewritten. Got: {rewritten}" + ); + assert!( + !rewritten.contains("https://origin.example.com/news"), + "Origin URL should be removed. Got: {rewritten}" + ); + } + + #[test] + fn handles_nextjs_inlined_data_html_escaping_fixture() { + // Fixture includes `\\u003c` escapes, matching Next.js `htmlEscapeJsonString` behavior. + let html = include_str!("fixtures/inlined-data-escaped.html"); + let scripts = find_rsc_push_scripts(html); + assert_eq!(scripts.len(), 1, "Should find exactly one RSC data script"); + + let rewritten = + post_process_rsc_html(html, "origin.example.com", "proxy.example.com", "https"); + assert!( + rewritten.contains("https://proxy.example.com/about"), + "Escaped fixture URL should be rewritten. Got: {rewritten}" + ); + assert!( + rewritten.contains(r#"\\u003ca href=\\\"https://proxy.example.com/about\\\""#), + "Escaped HTML should remain escaped and rewritten. Got: {rewritten}" + ); + assert!( + !rewritten.contains("https://origin.example.com/about"), + "Origin URL should be removed. Got: {rewritten}" + ); + } + + #[test] + fn handles_trailing_backslash_gracefully() { + // Malformed content with trailing backslash should not panic + let html = r#" + + +"#; + + let scripts = find_rsc_push_scripts(html); + // The first script is malformed (trailing backslash escapes the quote), + // so it won't be detected as valid. The second one should be found. 
+ assert!( + !scripts.is_empty(), + "Should find at least the valid script. Found: {}", + scripts.len() + ); + + // Should not panic during processing + let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); + assert!( + result.contains("test.example.com") || result.contains("origin.example.com"), + "Processing should complete without panic" + ); + } + + #[test] + fn handles_unterminated_string_gracefully() { + // Content where string never closes - should not hang or panic + let html = r#" + +"#; + + let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); + assert_eq!(result, html, "HTML without origin should be unchanged"); + } +} diff --git a/crates/common/src/integrations/nextjs/mod.rs b/crates/common/src/integrations/nextjs/mod.rs new file mode 100644 index 0000000..d79e98c --- /dev/null +++ b/crates/common/src/integrations/nextjs/mod.rs @@ -0,0 +1,491 @@ +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; +use validator::Validate; + +use crate::integrations::IntegrationRegistration; +use crate::settings::{IntegrationConfig, Settings}; + +const NEXTJS_INTEGRATION_ID: &str = "nextjs"; + +mod html_post_process; +mod rsc; +mod rsc_placeholders; +mod script_rewriter; +mod shared; + +// Re-export deprecated legacy functions for backward compatibility. +// Production code should use the placeholder-based approach via NextJsHtmlPostProcessor. 
+#[allow(deprecated)] +pub use html_post_process::{post_process_rsc_html, post_process_rsc_html_in_place}; +pub use rsc::rewrite_rsc_scripts_combined; + +use html_post_process::NextJsHtmlPostProcessor; +use rsc_placeholders::NextJsRscPlaceholderRewriter; +use script_rewriter::NextJsNextDataRewriter; + +#[derive(Debug, Clone, Deserialize, Serialize, Validate)] +pub struct NextJsIntegrationConfig { + #[serde(default = "default_enabled")] + pub enabled: bool, + #[serde( + default = "default_rewrite_attributes", + deserialize_with = "crate::settings::vec_from_seq_or_map" + )] + #[validate(length(min = 1))] + pub rewrite_attributes: Vec, + #[serde(default = "default_max_combined_payload_bytes")] + pub max_combined_payload_bytes: usize, +} + +impl IntegrationConfig for NextJsIntegrationConfig { + fn is_enabled(&self) -> bool { + self.enabled + } +} + +fn default_enabled() -> bool { + false +} + +fn default_rewrite_attributes() -> Vec { + vec!["href".to_string(), "link".to_string(), "url".to_string()] +} + +fn default_max_combined_payload_bytes() -> usize { + 10 * 1024 * 1024 +} + +pub fn register(settings: &Settings) -> Option { + let config = match build(settings) { + Some(config) => { + log::info!( + "NextJS integration registered: enabled={}, rewrite_attributes={:?}, max_combined_payload_bytes={}", + config.enabled, + config.rewrite_attributes, + config.max_combined_payload_bytes + ); + config + } + None => { + log::info!("NextJS integration not registered (disabled or missing config)"); + return None; + } + }; + + // Register a structured (Pages Router __NEXT_DATA__) rewriter. + let structured = Arc::new(NextJsNextDataRewriter::new(config.clone())); + + // Insert placeholders for App Router RSC payload scripts during the initial HTML rewrite pass, + // then substitute them during post-processing without re-parsing HTML. 
+ let placeholders = Arc::new(NextJsRscPlaceholderRewriter::new(config.clone())); + + // Register post-processor for cross-script RSC T-chunks + let post_processor = Arc::new(NextJsHtmlPostProcessor::new(config.clone())); + + let builder = IntegrationRegistration::builder(NEXTJS_INTEGRATION_ID) + .with_script_rewriter(structured) + .with_script_rewriter(placeholders) + .with_html_post_processor(post_processor); + + Some(builder.build()) +} + +fn build(settings: &Settings) -> Option> { + let config = settings + .integration_config::(NEXTJS_INTEGRATION_ID) + .ok() + .flatten()?; + Some(Arc::new(config)) +} + +#[cfg(test)] +mod tests { + use super::rsc_placeholders::RSC_PAYLOAD_PLACEHOLDER_PREFIX; + use super::*; + use crate::html_processor::{create_html_processor, HtmlProcessorConfig}; + use crate::integrations::IntegrationRegistry; + use crate::streaming_processor::{Compression, PipelineConfig, StreamingPipeline}; + use crate::test_support::tests::create_test_settings; + use serde_json::json; + use std::io::Cursor; + + fn config_from_settings( + settings: &Settings, + registry: &IntegrationRegistry, + ) -> HtmlProcessorConfig { + HtmlProcessorConfig::from_settings( + settings, + registry, + "origin.example.com", + "test.example.com", + "https", + ) + } + + #[test] + fn html_processor_rewrites_nextjs_script_when_enabled() { + let html = r#" + + "#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["href", "link", "url"], + }), + ) + .expect("should update nextjs config"); + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut 
output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + let processed = String::from_utf8_lossy(&output); + + // Note: URLs may have padding characters for length preservation + assert!( + processed.contains("test.example.com") && processed.contains("/reviews"), + "should rewrite https Next.js href values to test.example.com" + ); + assert!( + processed.contains("test.example.com") && processed.contains("/sign-in"), + "should rewrite http Next.js href values to test.example.com" + ); + assert!( + processed.contains(r#""fallbackHref":"http://origin.example.com/legacy""#), + "should leave other fields untouched" + ); + assert!( + processed.contains(r#""protoRelative":"//origin.example.com/assets/logo.png""#), + "should not rewrite non-href keys" + ); + assert!( + !processed.contains("\"href\":\"https://origin.example.com/reviews\""), + "should remove origin https href" + ); + assert!( + !processed.contains("\"href\":\"http://origin.example.com/sign-in\""), + "should remove origin http href" + ); + } + + #[test] + fn html_processor_rewrites_rsc_stream_payload_with_length_preservation() { + // RSC payloads (self.__next_f.push) are rewritten via post-processing. + // The streaming phase skips RSC push scripts, and the HTML post-processor handles them + // at end-of-document to correctly handle cross-script T-chunks. 
+ let html = r#" + + "#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["href", "link", "url"], + }), + ) + .expect("should update nextjs config"); + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + + let final_html = String::from_utf8_lossy(&output); + + // RSC payloads should be rewritten via end-of-document post-processing + assert!( + final_html.contains("test.example.com"), + "RSC stream payloads should be rewritten to proxy host via post-processing. Output: {}", + final_html + ); + assert!( + !final_html.contains(RSC_PAYLOAD_PLACEHOLDER_PREFIX), + "RSC placeholder markers should not appear in final HTML. 
Output: {}", + final_html + ); + } + + #[test] + fn html_processor_rewrites_rsc_stream_payload_with_chunked_input() { + // RSC payloads are rewritten via post-processing, even with chunked streaming input + let html = r#" + + "#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["href", "url"], + }), + ) + .expect("should update nextjs config"); + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 32, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + + let final_html = String::from_utf8_lossy(&output); + + // RSC payloads should be rewritten via end-of-document post-processing + assert!( + final_html.contains("test.example.com"), + "RSC stream payloads should be rewritten to proxy host with chunked input. Output: {}", + final_html + ); + assert!( + !final_html.contains(RSC_PAYLOAD_PLACEHOLDER_PREFIX), + "RSC placeholder markers should not appear in final HTML. Output: {}", + final_html + ); + } + + #[test] + fn html_processor_respects_max_combined_payload_bytes() { + // When the combined payload size exceeds `max_combined_payload_bytes` and the document + // contains cross-script T-chunks, we skip post-processing to avoid breaking hydration. 
+ let html = r#" + + +"#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["href", "link", "url"], + "max_combined_payload_bytes": 1, + }), + ) + .expect("should update nextjs config"); + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + + let final_html = String::from_utf8_lossy(&output); + + assert!( + final_html.contains("https://origin.example.com/page"), + "Origin URL should remain when rewrite is skipped due to size limit. Output: {}", + final_html + ); + assert!( + !final_html.contains("test.example.com"), + "Proxy host should not be introduced when rewrite is skipped. Output: {}", + final_html + ); + assert!( + !final_html.contains(RSC_PAYLOAD_PLACEHOLDER_PREFIX), + "RSC placeholder markers should not appear in final HTML. Output: {}", + final_html + ); + } + + #[test] + fn register_respects_enabled_flag() { + let settings = create_test_settings(); + let registration = register(&settings); + + assert!( + registration.is_none(), + "should skip registration when integration is disabled" + ); + } + + #[test] + fn html_processor_rewrites_rsc_payloads_with_length_preservation() { + // RSC payloads (self.__next_f.push) are rewritten via post-processing. + // This allows navigation to stay on proxy while correctly handling cross-script T-chunks. 
+ + let html = r#" + +"#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["url"], + }), + ) + .expect("should update nextjs config"); + + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + let final_html = String::from_utf8_lossy(&output); + + // RSC payloads should be rewritten via post-processing + assert!( + final_html.contains("test.example.com"), + "RSC payload URLs should be rewritten to proxy host. Output: {}", + final_html + ); + + // Verify the RSC payload structure is preserved + assert!( + final_html.contains(r#""ID":879000"#), + "RSC payload ID should be preserved" + ); + assert!( + final_html.contains(r#""title":"Makes""#), + "RSC payload title should be preserved" + ); + assert!( + final_html.contains(r#""children":"$45a""#), + "RSC payload children reference should be preserved" + ); + + // Verify \n separators are preserved (crucial for RSC parsing) + assert!( + final_html.contains(r#"\n442:"#), + "RSC record separator \\n should be preserved. Output: {}", + final_html + ); + assert!( + !final_html.contains(RSC_PAYLOAD_PLACEHOLDER_PREFIX), + "RSC placeholder markers should not appear in final HTML. Output: {}", + final_html + ); + } + + #[test] + fn html_processor_preserves_non_rsc_scripts_with_chunked_streaming() { + // Regression test: ensure non-RSC scripts are preserved when streamed alongside RSC scripts. 
+ // With small chunk sizes, scripts get fragmented and the buffering logic must correctly + // handle non-RSC scripts without corrupting them. + let html = r#" + + + +"#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["url"], + }), + ) + .expect("should update nextjs config"); + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + // Use small chunk size to force fragmentation + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 16, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + let final_html = String::from_utf8_lossy(&output); + + // Non-RSC scripts should be preserved + assert!( + final_html.contains(r#"console.log("hello world");"#), + "First non-RSC script should be preserved intact. Output: {}", + final_html + ); + assert!( + final_html.contains("window.analytics"), + "Third non-RSC script should be preserved. Output: {}", + final_html + ); + assert!( + final_html.contains("track: function(e)"), + "Third non-RSC script content should be intact. Output: {}", + final_html + ); + + // RSC scripts should be rewritten + assert!( + final_html.contains("test.example.com"), + "RSC URL should be rewritten. Output: {}", + final_html + ); + assert!( + !final_html.contains(RSC_PAYLOAD_PLACEHOLDER_PREFIX), + "No placeholders should remain. 
Output: {}", + final_html + ); + } +} diff --git a/crates/common/src/integrations/nextjs/rsc.rs b/crates/common/src/integrations/nextjs/rsc.rs new file mode 100644 index 0000000..534c912 --- /dev/null +++ b/crates/common/src/integrations/nextjs/rsc.rs @@ -0,0 +1,822 @@ +use once_cell::sync::Lazy; +use regex::Regex; + +use super::shared::RscUrlRewriter; + +/// T-chunk header pattern: hex_id:Thex_length, +static TCHUNK_PATTERN: Lazy = + Lazy::new(|| Regex::new(r"([0-9a-fA-F]+):T([0-9a-fA-F]+),").expect("valid T-chunk regex")); + +/// Marker used to track script boundaries when combining RSC content. +pub(crate) const RSC_MARKER: &str = "\x00SPLIT\x00"; + +/// Default maximum combined payload size for cross-script processing (10 MB). +pub(crate) const DEFAULT_MAX_COMBINED_PAYLOAD_BYTES: usize = 10 * 1024 * 1024; + +/// Maximum reasonable T-chunk length to prevent DoS from malformed input (100 MB). +/// A T-chunk larger than this is almost certainly malformed and would cause excessive +/// memory allocation or iteration. +const MAX_REASONABLE_TCHUNK_LENGTH: usize = 100 * 1024 * 1024; + +// ============================================================================= +// Escape Sequence Parsing +// ============================================================================= +// +// JS escape sequences are parsed by a shared iterator to avoid code duplication. +// The iterator yields (source_len, unescaped_byte_count) for each logical unit. + +/// A single parsed element from a JS string. +#[derive(Clone, Copy)] +struct EscapeElement { + /// Number of unescaped bytes this represents. + byte_count: usize, +} + +/// Iterator over escape sequences in a JS string. +/// Yields the unescaped byte count for each element. 
+struct EscapeSequenceIter<'a> { + bytes: &'a [u8], + str_ref: &'a str, + pos: usize, + skip_marker: Option<&'a [u8]>, +} + +impl<'a> EscapeSequenceIter<'a> { + fn new(s: &'a str) -> Self { + Self { + bytes: s.as_bytes(), + str_ref: s, + pos: 0, + skip_marker: None, + } + } + + fn with_marker(s: &'a str, marker: &'a [u8]) -> Self { + Self { + bytes: s.as_bytes(), + str_ref: s, + pos: 0, + skip_marker: Some(marker), + } + } + + fn from_position(s: &'a str, start: usize) -> Self { + Self { + bytes: s.as_bytes(), + str_ref: s, + pos: start, + skip_marker: None, + } + } + + fn from_position_with_marker(s: &'a str, start: usize, marker: &'a [u8]) -> Self { + Self { + bytes: s.as_bytes(), + str_ref: s, + pos: start, + skip_marker: Some(marker), + } + } + + /// Current position in the source string. + fn position(&self) -> usize { + self.pos + } +} + +impl Iterator for EscapeSequenceIter<'_> { + type Item = EscapeElement; + + fn next(&mut self) -> Option { + if self.pos >= self.bytes.len() { + return None; + } + + if let Some(marker) = self.skip_marker { + if self.pos + marker.len() <= self.bytes.len() + && &self.bytes[self.pos..self.pos + marker.len()] == marker + { + self.pos += marker.len(); + return Some(EscapeElement { byte_count: 0 }); + } + } + + if self.bytes[self.pos] == b'\\' && self.pos + 1 < self.bytes.len() { + let esc = self.bytes[self.pos + 1]; + + if matches!( + esc, + b'n' | b'r' | b't' | b'b' | b'f' | b'v' | b'"' | b'\'' | b'\\' | b'/' + ) { + self.pos += 2; + return Some(EscapeElement { byte_count: 1 }); + } + + if esc == b'x' && self.pos + 3 < self.bytes.len() { + self.pos += 4; + return Some(EscapeElement { byte_count: 1 }); + } + + if esc == b'u' && self.pos + 5 < self.bytes.len() { + let hex = &self.str_ref[self.pos + 2..self.pos + 6]; + if hex.chars().all(|c| c.is_ascii_hexdigit()) { + if let Ok(code_unit) = u16::from_str_radix(hex, 16) { + if (0xD800..=0xDBFF).contains(&code_unit) + && self.pos + 11 < self.bytes.len() + && self.bytes[self.pos + 6] 
== b'\\' + && self.bytes[self.pos + 7] == b'u' + { + let hex2 = &self.str_ref[self.pos + 8..self.pos + 12]; + if hex2.chars().all(|c| c.is_ascii_hexdigit()) { + if let Ok(code_unit2) = u16::from_str_radix(hex2, 16) { + if (0xDC00..=0xDFFF).contains(&code_unit2) { + self.pos += 12; + return Some(EscapeElement { byte_count: 4 }); + } + } + } + } + + let c = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); + self.pos += 6; + return Some(EscapeElement { + byte_count: c.len_utf8(), + }); + } + } + } + } + + if self.bytes[self.pos] < 0x80 { + self.pos += 1; + Some(EscapeElement { byte_count: 1 }) + } else { + let c = self.str_ref[self.pos..] + .chars() + .next() + .unwrap_or('\u{FFFD}'); + let len = c.len_utf8(); + self.pos += len; + Some(EscapeElement { byte_count: len }) + } + } +} + +/// Calculate the unescaped byte length of a JS string with escape sequences. +fn calculate_unescaped_byte_length(s: &str) -> usize { + EscapeSequenceIter::new(s).map(|e| e.byte_count).sum() +} + +/// Consume a specified number of unescaped bytes from a JS string, returning the end position. +fn consume_unescaped_bytes(s: &str, start_pos: usize, byte_count: usize) -> (usize, usize) { + let mut iter = EscapeSequenceIter::from_position(s, start_pos); + let mut consumed = 0; + + while consumed < byte_count { + match iter.next() { + Some(elem) => consumed += elem.byte_count, + None => break, + } + } + + (iter.position(), consumed) +} + +// ============================================================================= +// T-chunk discovery +// ============================================================================= + +/// Information about a T-chunk found in the combined RSC content. +struct TChunkInfo { + /// Position where the T-chunk header starts (e.g., position of "1a:T..."). + match_start: usize, + /// Position right after the chunk ID (position of ":T"). + id_end: usize, + /// Position right after the comma (where content begins). 
+ header_end: usize, + /// Position where the content ends. + content_end: usize, +} + +/// Find all T-chunks in content, optionally skipping markers. +fn find_tchunks_impl(content: &str, skip_markers: bool) -> Option> { + let mut chunks = Vec::new(); + let mut search_pos = 0; + let marker = if skip_markers { + Some(RSC_MARKER.as_bytes()) + } else { + None + }; + + while search_pos < content.len() { + if let Some(cap) = TCHUNK_PATTERN.captures(&content[search_pos..]) { + let m = cap.get(0).expect("T-chunk match should exist"); + let match_start = search_pos + m.start(); + let header_end = search_pos + m.end(); + + let id_match = cap.get(1).expect("T-chunk id should exist"); + let id_end = search_pos + id_match.end(); + let length_hex = cap.get(2).expect("T-chunk length should exist").as_str(); + let declared_length = usize::from_str_radix(length_hex, 16) + .ok() + .filter(|&len| len <= MAX_REASONABLE_TCHUNK_LENGTH)?; + + let content_end = if let Some(marker_bytes) = marker { + let mut iter = EscapeSequenceIter::from_position_with_marker( + content, + header_end, + marker_bytes, + ); + let mut consumed = 0; + while consumed < declared_length { + match iter.next() { + Some(elem) => consumed += elem.byte_count, + None => break, + } + } + if consumed < declared_length { + return None; + } + iter.position() + } else { + let (pos, consumed) = consume_unescaped_bytes(content, header_end, declared_length); + if consumed < declared_length { + return None; + } + pos + }; + + chunks.push(TChunkInfo { + match_start, + id_end, + header_end, + content_end, + }); + + search_pos = content_end; + } else { + break; + } + } + + Some(chunks) +} + +fn find_tchunks(content: &str) -> Option> { + find_tchunks_impl(content, false) +} + +fn find_tchunks_with_markers(content: &str) -> Option> { + find_tchunks_impl(content, true) +} + +// ============================================================================= +// Single-script T-chunk processing +// 
============================================================================= + +pub(crate) fn rewrite_rsc_tchunks_with_rewriter( + content: &str, + rewriter: &RscUrlRewriter, +) -> String { + let Some(chunks) = find_tchunks(content) else { + log::warn!( + "RSC payload contains invalid or incomplete T-chunks; skipping rewriting to avoid breaking hydration" + ); + return content.to_string(); + }; + + if chunks.is_empty() { + return rewriter.rewrite_to_string(content); + } + + let mut result = String::with_capacity(content.len()); + let mut last_end = 0; + + for chunk in &chunks { + let before = &content[last_end..chunk.match_start]; + result.push_str(rewriter.rewrite(before).as_ref()); + + let chunk_content = &content[chunk.header_end..chunk.content_end]; + let rewritten_content = rewriter.rewrite_to_string(chunk_content); + + let new_length = calculate_unescaped_byte_length(&rewritten_content); + let new_length_hex = format!("{new_length:x}"); + + result.push_str(&content[chunk.match_start..chunk.id_end]); + result.push_str(":T"); + result.push_str(&new_length_hex); + result.push(','); + result.push_str(&rewritten_content); + + last_end = chunk.content_end; + } + + let remaining = &content[last_end..]; + result.push_str(rewriter.rewrite(remaining).as_ref()); + + result +} + +// ============================================================================= +// Cross-script RSC processing +// ============================================================================= + +fn calculate_unescaped_byte_length_skip_markers(s: &str) -> usize { + EscapeSequenceIter::with_marker(s, RSC_MARKER.as_bytes()) + .map(|e| e.byte_count) + .sum() +} + +/// Process multiple RSC script payloads together, handling cross-script T-chunks. 
+pub fn rewrite_rsc_scripts_combined( + payloads: &[&str], + origin_host: &str, + request_host: &str, + request_scheme: &str, +) -> Vec { + rewrite_rsc_scripts_combined_with_limit( + payloads, + origin_host, + request_host, + request_scheme, + DEFAULT_MAX_COMBINED_PAYLOAD_BYTES, + ) +} + +fn payload_contains_incomplete_tchunk(payload: &str) -> bool { + let mut search_pos = 0; + while search_pos < payload.len() { + let Some(cap) = TCHUNK_PATTERN.captures(&payload[search_pos..]) else { + break; + }; + + let m = cap.get(0).expect("T-chunk match should exist"); + let header_end = search_pos + m.end(); + + let length_hex = cap.get(2).expect("T-chunk length should exist").as_str(); + let Some(declared_length) = usize::from_str_radix(length_hex, 16) + .ok() + .filter(|&len| len <= MAX_REASONABLE_TCHUNK_LENGTH) + else { + return true; + }; + + let (pos, consumed) = consume_unescaped_bytes(payload, header_end, declared_length); + if consumed < declared_length { + return true; + } + + search_pos = pos; + } + + false +} + +pub(crate) fn rewrite_rsc_scripts_combined_with_limit( + payloads: &[&str], + origin_host: &str, + request_host: &str, + request_scheme: &str, + max_combined_payload_bytes: usize, +) -> Vec { + if payloads.is_empty() { + return Vec::new(); + } + + // Early exit if no payload contains the origin host - avoids regex compilation + if !payloads.iter().any(|p| p.contains(origin_host)) { + return payloads.iter().map(|p| (*p).to_string()).collect(); + } + + let rewriter = RscUrlRewriter::new(origin_host, request_host, request_scheme); + + if payloads.len() == 1 { + return vec![rewrite_rsc_tchunks_with_rewriter(payloads[0], &rewriter)]; + } + + let max_combined_payload_bytes = if max_combined_payload_bytes == 0 { + DEFAULT_MAX_COMBINED_PAYLOAD_BYTES + } else { + max_combined_payload_bytes + }; + + // Check total size before allocating combined buffer + let total_size: usize = + payloads.iter().map(|p| p.len()).sum::() + (payloads.len() - 1) * RSC_MARKER.len(); + + 
if total_size > max_combined_payload_bytes { + // Avoid allocating a large combined buffer. If the payloads contain cross-script T-chunks, + // per-script rewriting is unsafe because it may rewrite T-chunk content without updating + // the original header, breaking React hydration. + log::warn!( + "RSC combined payload size {} exceeds limit {}, skipping cross-script combining", + total_size, + max_combined_payload_bytes + ); + + if payloads + .iter() + .any(|p| payload_contains_incomplete_tchunk(p)) + { + log::warn!( + "RSC payloads contain cross-script T-chunks; skipping RSC URL rewriting to avoid breaking hydration (consider increasing integrations.nextjs.max_combined_payload_bytes)" + ); + return payloads.iter().map(|p| (*p).to_string()).collect(); + } + + return payloads + .iter() + .map(|p| rewrite_rsc_tchunks_with_rewriter(p, &rewriter)) + .collect(); + } + + let mut combined = String::with_capacity(total_size); + combined.push_str(payloads[0]); + for payload in &payloads[1..] { + combined.push_str(RSC_MARKER); + combined.push_str(payload); + } + + let Some(chunks) = find_tchunks_with_markers(&combined) else { + log::warn!( + "RSC combined payload contains invalid or incomplete T-chunks; skipping rewriting to avoid breaking hydration" + ); + return payloads.iter().map(|p| (*p).to_string()).collect(); + }; + if chunks.is_empty() { + return payloads + .iter() + .map(|p| rewriter.rewrite_to_string(p)) + .collect(); + } + + let mut result = String::with_capacity(combined.len()); + let mut last_end = 0; + + for chunk in &chunks { + let before = &combined[last_end..chunk.match_start]; + result.push_str(rewriter.rewrite(before).as_ref()); + + let chunk_content = &combined[chunk.header_end..chunk.content_end]; + let rewritten_content = rewriter.rewrite_to_string(chunk_content); + + let new_length = calculate_unescaped_byte_length_skip_markers(&rewritten_content); + let new_length_hex = format!("{new_length:x}"); + + 
result.push_str(&combined[chunk.match_start..chunk.id_end]); + result.push_str(":T"); + result.push_str(&new_length_hex); + result.push(','); + result.push_str(&rewritten_content); + + last_end = chunk.content_end; + } + + let remaining = &combined[last_end..]; + result.push_str(rewriter.rewrite(remaining).as_ref()); + + result.split(RSC_MARKER).map(|s| s.to_string()).collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn tchunk_length_recalculation() { + let content = r#"1a:T29,{"url":"https://origin.example.com/path"}"#; + let rewriter = RscUrlRewriter::new("origin.example.com", "test.example.com", "https"); + let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + + assert!( + result.contains("test.example.com"), + "URL should be rewritten" + ); + assert!( + result.starts_with("1a:T27,"), + "T-chunk length should be updated from 29 (41) to 27 (39). Got: {}", + result + ); + } + + #[test] + fn tchunk_length_recalculation_with_length_increase() { + let content = r#"1a:T1c,{"url":"https://short.io/x"}"#; + let rewriter = RscUrlRewriter::new("short.io", "test.example.com", "https"); + let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + + assert!( + result.contains("test.example.com"), + "URL should be rewritten" + ); + assert!( + result.starts_with("1a:T24,"), + "T-chunk length should be updated from 1c (28) to 24 (36). 
Got: {}", + result + ); + } + + #[test] + fn calculate_unescaped_byte_length_handles_common_escapes() { + assert_eq!(calculate_unescaped_byte_length("hello"), 5); + assert_eq!(calculate_unescaped_byte_length(r#"\n"#), 1); + assert_eq!(calculate_unescaped_byte_length(r#"\r\n"#), 2); + assert_eq!(calculate_unescaped_byte_length(r#"\""#), 1); + assert_eq!(calculate_unescaped_byte_length(r#"\\"#), 1); + assert_eq!(calculate_unescaped_byte_length(r#"\x41"#), 1); + assert_eq!(calculate_unescaped_byte_length(r#"\u0041"#), 1); + assert_eq!(calculate_unescaped_byte_length(r#"\u00e9"#), 2); + } + + #[test] + fn multiple_tchunks() { + let content = r#"1a:T1c,{"url":"https://short.io/x"}\n1b:T1c,{"url":"https://short.io/y"}"#; + let rewriter = RscUrlRewriter::new("short.io", "test.example.com", "https"); + let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + + assert!( + result.contains("test.example.com"), + "URLs should be rewritten" + ); + let count = result.matches(":T24,").count(); + assert_eq!(count, 2, "Both T-chunks should have updated lengths"); + } + + #[test] + fn cross_script_tchunk_rewriting() { + let script0 = r#"other:data\n1a:T3e,partial content"#; + let script1 = r#" with https://origin.example.com/page goes here"#; + + let combined_content = "partial content with https://origin.example.com/page goes here"; + let combined_len = calculate_unescaped_byte_length(combined_content); + println!( + "Combined T-chunk content length: {} bytes = 0x{:x}", + combined_len, combined_len + ); + + let payloads: Vec<&str> = vec![script0, script1]; + let results = rewrite_rsc_scripts_combined( + &payloads, + "origin.example.com", + "test.example.com", + "https", + ); + + assert_eq!(results.len(), 2, "Should return same number of scripts"); + assert!( + results[1].contains("test.example.com"), + "URL in script 1 should be rewritten. 
Got: {}", + results[1] + ); + + let rewritten_content = "partial content with https://test.example.com/page goes here"; + let rewritten_len = calculate_unescaped_byte_length(rewritten_content); + let expected_header = format!(":T{:x},", rewritten_len); + assert!( + results[0].contains(&expected_header), + "T-chunk length in script 0 should be updated to {}. Got: {}", + expected_header, + results[0] + ); + } + + #[test] + fn cross_script_preserves_non_tchunk_content() { + let script0 = r#"{"url":"https://origin.example.com/first"}\n1a:T38,partial"#; + let script1 = r#" content with https://origin.example.com/page end"#; + + let payloads: Vec<&str> = vec![script0, script1]; + let results = rewrite_rsc_scripts_combined( + &payloads, + "origin.example.com", + "test.example.com", + "https", + ); + + assert!( + results[0].contains("test.example.com/first"), + "URL outside T-chunk should be rewritten. Got: {}", + results[0] + ); + + assert!( + results[1].contains("test.example.com/page"), + "URL inside cross-script T-chunk should be rewritten. Got: {}", + results[1] + ); + } + + #[test] + fn preserves_protocol_relative_urls() { + let input = r#"{"url":"//origin.example.com/path"}"#; + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let rewritten = rewriter.rewrite_to_string(input); + + assert!( + rewritten.contains(r#""url":"//proxy.example.com/path""#), + "Protocol-relative URL should remain protocol-relative. Got: {rewritten}", + ); + } + + #[test] + fn rewrites_bare_host_occurrences() { + let input = r#"{"siteProductionDomain":"origin.example.com"}"#; + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let rewritten = rewriter.rewrite_to_string(input); + + assert!( + rewritten.contains(r#""siteProductionDomain":"proxy.example.com""#), + "Bare host should be rewritten inside RSC payload. 
Got: {rewritten}" + ); + } + + #[test] + fn bare_host_rewrite_respects_hostname_boundaries() { + let input = r#"{"sub":"cdn.origin.example.com","prefix":"notorigin.example.com","suffix":"origin.example.com.uk","path":"origin.example.com/news","exact":"origin.example.com"}"#; + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let rewritten = rewriter.rewrite_to_string(input); + + assert!( + rewritten.contains(r#""sub":"cdn.origin.example.com""#), + "Subdomain should not be rewritten. Got: {rewritten}" + ); + assert!( + rewritten.contains(r#""prefix":"notorigin.example.com""#), + "Prefix substring should not be rewritten. Got: {rewritten}" + ); + assert!( + rewritten.contains(r#""suffix":"origin.example.com.uk""#), + "Suffix domain should not be rewritten. Got: {rewritten}" + ); + assert!( + rewritten.contains(r#""path":"proxy.example.com/news""#), + "Bare host with path should be rewritten. Got: {rewritten}" + ); + assert!( + rewritten.contains(r#""exact":"proxy.example.com""#), + "Exact bare host should be rewritten. Got: {rewritten}" + ); + } + + #[test] + fn single_payload_bypasses_combining() { + // When there's only one payload, we should process it directly without combining + // Content: {"url":"https://origin.example.com/x"} = 37 bytes = 0x25 hex + let payload = r#"1a:T25,{"url":"https://origin.example.com/x"}"#; + let payloads: Vec<&str> = vec![payload]; + + let results = rewrite_rsc_scripts_combined( + &payloads, + "origin.example.com", + "test.example.com", + "https", + ); + + assert_eq!(results.len(), 1); + assert!( + results[0].contains("test.example.com"), + "Single payload should be rewritten. Got: {}", + results[0] + ); + // The length should be updated for the rewritten URL + // {"url":"https://test.example.com/x"} = 35 bytes = 0x23 hex + assert!( + results[0].contains(":T23,"), + "T-chunk length should be updated. 
Got: {}", + results[0] + ); + } + + #[test] + fn empty_payloads_returns_empty() { + let payloads: Vec<&str> = vec![]; + let results = rewrite_rsc_scripts_combined( + &payloads, + "origin.example.com", + "test.example.com", + "https", + ); + assert!(results.is_empty()); + } + + #[test] + fn no_origin_in_payloads_returns_unchanged() { + let payloads: Vec<&str> = vec![r#"1a:T10,{"key":"value"}"#, r#"1b:T10,{"foo":"bar"}"#]; + + let results = rewrite_rsc_scripts_combined( + &payloads, + "origin.example.com", + "test.example.com", + "https", + ); + + assert_eq!(results.len(), 2); + // Content should be identical - note that T-chunk lengths may be recalculated + // even if content is unchanged (due to how the algorithm works) + assert!( + !results[0].contains("origin.example.com") && !results[0].contains("test.example.com"), + "No host should be present in payload without URLs" + ); + assert!( + !results[1].contains("origin.example.com") && !results[1].contains("test.example.com"), + "No host should be present in payload without URLs" + ); + // The content after T-chunk header should be preserved + assert!( + results[0].contains(r#"{"key":"value"}"#), + "Content should be preserved. Got: {}", + results[0] + ); + assert!( + results[1].contains(r#"{"foo":"bar"}"#), + "Content should be preserved. 
Got: {}", + results[1] + ); + } + + #[test] + fn size_limit_skips_rewrite_when_cross_script_tchunk_detected() { + let script0 = r#"other:data\n1a:T40,partial content"#; + let script1 = r#" with https://origin.example.com/page goes here"#; + + let payloads: Vec<&str> = vec![script0, script1]; + let results = rewrite_rsc_scripts_combined_with_limit( + &payloads, + "origin.example.com", + "test.example.com", + "https", + 1, + ); + + assert_eq!(results.len(), 2, "Should return same number of scripts"); + assert_eq!( + results[0], script0, + "Cross-script payload should remain unchanged when size limit is exceeded" + ); + assert_eq!( + results[1], script1, + "Cross-script payload should remain unchanged when size limit is exceeded" + ); + } + + #[test] + fn size_limit_rewrites_individually_when_tchunks_are_complete() { + let script0 = r#"1a:T25,{"url":"https://origin.example.com/x"}"#; + let script1 = r#"1b:T25,{"url":"https://origin.example.com/y"}"#; + + let payloads: Vec<&str> = vec![script0, script1]; + let results = rewrite_rsc_scripts_combined_with_limit( + &payloads, + "origin.example.com", + "test.example.com", + "https", + 1, + ); + + assert_eq!(results.len(), 2, "Should return same number of scripts"); + assert!( + results[0].contains("test.example.com"), + "First payload should be rewritten. Got: {}", + results[0] + ); + assert!( + results[1].contains("test.example.com"), + "Second payload should be rewritten. Got: {}", + results[1] + ); + assert!( + results[0].contains(":T23,"), + "First payload T-chunk length should be updated. Got: {}", + results[0] + ); + assert!( + results[1].contains(":T23,"), + "Second payload T-chunk length should be updated. 
Got: {}", + results[1] + ); + } + + #[test] + fn invalid_or_unreasonable_tchunk_length_skips_rewriting() { + let content = r#"1a:T10000000,{"url":"https://origin.example.com/path"}"#; + let rewriter = RscUrlRewriter::new("origin.example.com", "test.example.com", "https"); + let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + + assert_eq!( + result, content, + "Should skip rewriting when T-chunk length is unreasonable" + ); + } + + #[test] + fn incomplete_tchunk_skips_rewriting() { + let content = r#"1a:Tff,{"url":"https://origin.example.com/path"}"#; + let rewriter = RscUrlRewriter::new("origin.example.com", "test.example.com", "https"); + let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + + assert_eq!( + result, content, + "Should skip rewriting when T-chunk content is incomplete" + ); + } +} diff --git a/crates/common/src/integrations/nextjs/rsc_placeholders.rs b/crates/common/src/integrations/nextjs/rsc_placeholders.rs new file mode 100644 index 0000000..e26a98f --- /dev/null +++ b/crates/common/src/integrations/nextjs/rsc_placeholders.rs @@ -0,0 +1,207 @@ +use std::sync::{Arc, Mutex}; + +use crate::integrations::{ + IntegrationScriptContext, IntegrationScriptRewriter, ScriptRewriteAction, +}; + +use super::shared::find_rsc_push_payload_range; +use super::{NextJsIntegrationConfig, NEXTJS_INTEGRATION_ID}; + +pub(super) const RSC_PAYLOAD_PLACEHOLDER_PREFIX: &str = "__ts_rsc_payload_"; +pub(super) const RSC_PAYLOAD_PLACEHOLDER_SUFFIX: &str = "__"; + +/// State for RSC placeholder-based rewriting. +/// +/// Stores RSC payloads extracted during streaming for later rewriting during post-processing. +/// Only unfragmented RSC scripts are processed during streaming; fragmented scripts are +/// handled by the post-processor which re-parses the final HTML. 
#[derive(Default)]
pub(super) struct NextJsRscPostProcessState {
    // Original RSC payload strings, indexed by placeholder number.
    pub(super) payloads: Vec<String>,
}

impl NextJsRscPostProcessState {
    /// Move the collected payloads out, leaving the state empty.
    pub(super) fn take_payloads(&mut self) -> Vec<String> {
        std::mem::take(&mut self.payloads)
    }
}

/// Build the placeholder token for the payload stored at `index`.
fn rsc_payload_placeholder(index: usize) -> String {
    format!("{RSC_PAYLOAD_PLACEHOLDER_PREFIX}{index}{RSC_PAYLOAD_PLACEHOLDER_SUFFIX}")
}

pub(super) struct NextJsRscPlaceholderRewriter {
    config: Arc<NextJsIntegrationConfig>,
}

impl NextJsRscPlaceholderRewriter {
    pub(super) fn new(config: Arc<NextJsIntegrationConfig>) -> Self {
        Self { config }
    }
}

impl IntegrationScriptRewriter for NextJsRscPlaceholderRewriter {
    fn integration_id(&self) -> &'static str {
        NEXTJS_INTEGRATION_ID
    }

    fn selector(&self) -> &'static str {
        "script"
    }

    fn rewrite(&self, content: &str, ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction {
        if !self.config.enabled || self.config.rewrite_attributes.is_empty() {
            return ScriptRewriteAction::keep();
        }

        // Only process complete (unfragmented) scripts during streaming.
        // Fragmented scripts are handled by the post-processor which re-parses the final HTML.
        // This avoids corrupting non-RSC scripts that happen to be fragmented during streaming.
        if !ctx.is_last_in_text_node {
            // Script is fragmented - skip placeholder processing.
            // The post-processor will handle RSC scripts at end-of-document.
            return ScriptRewriteAction::keep();
        }

        // Quick check: skip scripts that can't be RSC payloads
        if !content.contains("__next_f") {
            return ScriptRewriteAction::keep();
        }

        let Some((payload_start, payload_end)) = find_rsc_push_payload_range(content) else {
            // Contains __next_f but doesn't match RSC push pattern - leave unchanged
            return ScriptRewriteAction::keep();
        };

        // Defensive sanity checks before replace_range, which panics on bad ranges.
        if payload_start > payload_end
            || payload_end > content.len()
            || !content.is_char_boundary(payload_start)
            || !content.is_char_boundary(payload_end)
        {
            return ScriptRewriteAction::keep();
        }

        // Insert placeholder for this RSC payload and store original for post-processing
        let state = ctx
            .document_state
            .get_or_insert_with(NEXTJS_INTEGRATION_ID, || {
                Mutex::new(NextJsRscPostProcessState::default())
            });
        let mut guard = state.lock().unwrap_or_else(|e| e.into_inner());

        let placeholder_index = guard.payloads.len();
        let placeholder = rsc_payload_placeholder(placeholder_index);
        guard
            .payloads
            .push(content[payload_start..payload_end].to_string());

        let mut rewritten = content.to_string();
        rewritten.replace_range(payload_start..payload_end, &placeholder);
        ScriptRewriteAction::replace(rewritten)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::integrations::IntegrationDocumentState;

    fn ctx<'a>(
        is_last_in_text_node: bool,
        document_state: &'a IntegrationDocumentState,
    ) -> IntegrationScriptContext<'a> {
        IntegrationScriptContext {
            selector: "script",
            request_host: "proxy.example.com",
            request_scheme: "https",
            origin_host: "origin.example.com",
            is_last_in_text_node,
            document_state,
        }
    }

    fn test_config() -> Arc<NextJsIntegrationConfig> {
        Arc::new(NextJsIntegrationConfig {
            enabled: true,
            rewrite_attributes: vec!["href".into(), "link".into(), "url".into()],
            max_combined_payload_bytes: 10 * 1024 * 1024,
        })
    }

    #[test]
    fn inserts_placeholder_and_records_payload() {
        let state = IntegrationDocumentState::default();
        let rewriter = NextJsRscPlaceholderRewriter::new(test_config());

        let script = r#"self.__next_f.push([1,"https://origin.example.com/page"])"#;
        let action = rewriter.rewrite(script, &ctx(true, &state));

        let ScriptRewriteAction::Replace(rewritten) = action else {
            panic!("Expected placeholder insertion to replace script");
        };
        assert!(
            rewritten.contains(RSC_PAYLOAD_PLACEHOLDER_PREFIX),
            "Rewritten script should contain placeholder. Got: {rewritten}"
        );

        let stored = state
            .get::<Mutex<NextJsRscPostProcessState>>(NEXTJS_INTEGRATION_ID)
            .expect("should store RSC state");
        let guard = stored.lock().expect("should lock Next.js RSC state");
        assert_eq!(guard.payloads.len(), 1, "Should store exactly one payload");
        assert_eq!(
            guard.payloads[0], "https://origin.example.com/page",
            "Stored payload should match original"
        );
    }

    #[test]
    fn skips_fragmented_scripts_for_post_processor_handling() {
        // Fragmented scripts are not processed during streaming - they're passed through
        // unchanged and handled by the post-processor which re-parses the final HTML.
        let state = IntegrationDocumentState::default();
        let rewriter = NextJsRscPlaceholderRewriter::new(test_config());

        let first = "self.__next_f.push([1,\"https://origin.example.com";
        let second = "/page\"])";

        // Intermediate chunk should be kept (not processed)
        let action_first = rewriter.rewrite(first, &ctx(false, &state));
        assert_eq!(
            action_first,
            ScriptRewriteAction::Keep,
            "Intermediate chunk should be kept unchanged"
        );

        // Final chunk should also be kept since it doesn't contain the full RSC pattern
        let action_second = rewriter.rewrite(second, &ctx(true, &state));
        assert_eq!(
            action_second,
            ScriptRewriteAction::Keep,
            "Final chunk of fragmented script should be kept"
        );

        // No payloads should be stored - post-processor will handle this
        assert!(
            state
                .get::<Mutex<NextJsRscPostProcessState>>(NEXTJS_INTEGRATION_ID)
                .is_none(),
            "No RSC state should be created for fragmented scripts"
        );
    }

    #[test]
    fn skips_non_rsc_scripts() {
        let state = IntegrationDocumentState::default();
        let rewriter = NextJsRscPlaceholderRewriter::new(test_config());

        let script = r#"console.log("hello world");"#;
        let action = rewriter.rewrite(script, &ctx(true, &state));

        assert_eq!(
            action,
            ScriptRewriteAction::Keep,
            "Non-RSC scripts should be kept unchanged"
        );
    }
}
diff --git a/crates/common/src/integrations/nextjs/script_rewriter.rs b/crates/common/src/integrations/nextjs/script_rewriter.rs
new file mode 100644
index 0000000..4df3493
--- /dev/null
+++ b/crates/common/src/integrations/nextjs/script_rewriter.rs
@@ -0,0 +1,425 @@
use std::sync::Arc;

use regex::{escape, Regex};

use crate::integrations::{
    IntegrationScriptContext, IntegrationScriptRewriter, ScriptRewriteAction,
};

use super::{NextJsIntegrationConfig, NEXTJS_INTEGRATION_ID};

pub(super) struct NextJsNextDataRewriter {
    config: Arc<NextJsIntegrationConfig>,
}

impl NextJsNextDataRewriter {
    pub(super) fn new(config: Arc<NextJsIntegrationConfig>) -> Self {
        Self { config }
    }

    fn rewrite_structured(
&self, + content: &str, + ctx: &IntegrationScriptContext<'_>, + ) -> ScriptRewriteAction { + if ctx.origin_host.is_empty() + || ctx.request_host.is_empty() + || self.config.rewrite_attributes.is_empty() + { + return ScriptRewriteAction::keep(); + } + + let rewriter = UrlRewriter::new( + ctx.origin_host, + ctx.request_host, + ctx.request_scheme, + &self.config.rewrite_attributes, + ); + + if let Some(rewritten) = rewrite_nextjs_values_with_rewriter(content, &rewriter) { + ScriptRewriteAction::replace(rewritten) + } else { + ScriptRewriteAction::keep() + } + } +} + +impl IntegrationScriptRewriter for NextJsNextDataRewriter { + fn integration_id(&self) -> &'static str { + NEXTJS_INTEGRATION_ID + } + + fn selector(&self) -> &'static str { + "script#__NEXT_DATA__" + } + + fn rewrite(&self, content: &str, ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction { + if self.config.rewrite_attributes.is_empty() { + return ScriptRewriteAction::keep(); + } + + self.rewrite_structured(content, ctx) + } +} + +fn rewrite_nextjs_values_with_rewriter(content: &str, rewriter: &UrlRewriter) -> Option { + rewriter.rewrite_embedded(content) +} + +#[cfg(test)] +fn rewrite_nextjs_values( + content: &str, + origin_host: &str, + request_host: &str, + request_scheme: &str, + attributes: &[String], +) -> Option { + if origin_host.is_empty() || request_host.is_empty() || attributes.is_empty() { + return None; + } + + let rewriter = UrlRewriter::new(origin_host, request_host, request_scheme, attributes); + + rewrite_nextjs_values_with_rewriter(content, &rewriter) +} + +/// Rewrites URLs in structured Next.js JSON payloads (e.g., `__NEXT_DATA__`). +/// +/// This rewriter uses combined regex patterns to find and replace URLs +/// in JSON content. It handles full URLs, protocol-relative URLs, and bare hostnames. +/// Patterns for all attributes are combined with alternation for efficiency. 
+struct UrlRewriter { + #[cfg_attr(not(test), allow(dead_code))] + origin_host: String, + request_host: String, + request_scheme: String, + /// Single regex matching URL patterns for all attributes + embedded_pattern: Option, + /// Single regex matching bare hostname patterns for all attributes + bare_host_pattern: Option, +} + +impl UrlRewriter { + fn new( + origin_host: &str, + request_host: &str, + request_scheme: &str, + attributes: &[String], + ) -> Self { + let escaped_origin = escape(origin_host); + + // Build a single regex with alternation for all attributes + let embedded_pattern = if attributes.is_empty() { + None + } else { + let attr_alternation = attributes + .iter() + .map(|attr| escape(attr)) + .collect::>() + .join("|"); + let pattern = format!( + r#"(?P(?:\\*")?(?:{attrs})(?:\\*")?:\\*")(?Phttps?://|//){origin}(?P[^"\\]*)(?P\\*")"#, + attrs = attr_alternation, + origin = escaped_origin, + ); + Some(Regex::new(&pattern).expect("valid Next.js rewrite regex")) + }; + + let bare_host_pattern = if attributes.is_empty() { + None + } else { + let attr_alternation = attributes + .iter() + .map(|attr| escape(attr)) + .collect::>() + .join("|"); + let pattern = format!( + r#"(?P(?:\\*")?(?:{attrs})(?:\\*")?:\\*"){origin}(?P\\*")"#, + attrs = attr_alternation, + origin = escaped_origin, + ); + Some(Regex::new(&pattern).expect("valid Next.js bare host rewrite regex")) + }; + + Self { + origin_host: origin_host.to_string(), + request_host: request_host.to_string(), + request_scheme: request_scheme.to_string(), + embedded_pattern, + bare_host_pattern, + } + } + + #[cfg(test)] + fn rewrite_url_value(&self, url: &str) -> Option { + if let Some(rest) = url.strip_prefix("https://") { + if rest.starts_with(&self.origin_host) { + let path = &rest[self.origin_host.len()..]; + return Some(format!( + "{}://{}{}", + self.request_scheme, self.request_host, path + )); + } + } else if let Some(rest) = url.strip_prefix("http://") { + if rest.starts_with(&self.origin_host) { 
+ let path = &rest[self.origin_host.len()..]; + return Some(format!( + "{}://{}{}", + self.request_scheme, self.request_host, path + )); + } + } else if let Some(rest) = url.strip_prefix("//") { + if rest.starts_with(&self.origin_host) { + let path = &rest[self.origin_host.len()..]; + return Some(format!("//{}{}", self.request_host, path)); + } + } else if url == self.origin_host { + return Some(self.request_host.clone()); + } else if url.starts_with(&self.origin_host) { + let path = &url[self.origin_host.len()..]; + return Some(format!("{}{}", self.request_host, path)); + } + None + } + + fn rewrite_embedded(&self, input: &str) -> Option { + let mut result = input.to_string(); + let mut changed = false; + + if let Some(regex) = &self.embedded_pattern { + let request_host = &self.request_host; + let request_scheme = &self.request_scheme; + + let next_value = regex.replace_all(&result, |caps: ®ex::Captures<'_>| { + let prefix = &caps["prefix"]; + let scheme = &caps["scheme"]; + let path = &caps["path"]; + let quote = &caps["quote"]; + + let new_url = if scheme == "//" { + format!("//{}{}", request_host, path) + } else { + format!("{}://{}{}", request_scheme, request_host, path) + }; + + format!("{prefix}{new_url}{quote}") + }); + + if next_value != result { + changed = true; + result = next_value.into_owned(); + } + } + + if let Some(regex) = &self.bare_host_pattern { + let request_host = &self.request_host; + + let next_value = regex.replace_all(&result, |caps: ®ex::Captures<'_>| { + let prefix = &caps["prefix"]; + let suffix = &caps["suffix"]; + + format!("{prefix}{request_host}{suffix}") + }); + + if next_value != result { + changed = true; + result = next_value.into_owned(); + } + } + + changed.then_some(result) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::integrations::IntegrationDocumentState; + use crate::integrations::ScriptRewriteAction; + + fn test_config() -> Arc { + Arc::new(NextJsIntegrationConfig { + enabled: true, + 
rewrite_attributes: vec!["href".into(), "link".into(), "url".into()], + max_combined_payload_bytes: 10 * 1024 * 1024, + }) + } + + fn ctx<'a>( + selector: &'static str, + document_state: &'a IntegrationDocumentState, + ) -> IntegrationScriptContext<'a> { + IntegrationScriptContext { + selector, + request_host: "ts.example.com", + request_scheme: "https", + origin_host: "origin.example.com", + is_last_in_text_node: true, + document_state, + } + } + + #[test] + fn structured_rewriter_updates_next_data_payload() { + let payload = r#"{"props":{"pageProps":{"primary":{"href":"https://origin.example.com/reviews"},"secondary":{"href":"http://origin.example.com/sign-in"},"fallbackHref":"http://origin.example.com/legacy","protoRelative":"//origin.example.com/assets/logo.png"}}}"#; + let rewriter = NextJsNextDataRewriter::new(test_config()); + let document_state = IntegrationDocumentState::default(); + let result = rewriter.rewrite(payload, &ctx("script#__NEXT_DATA__", &document_state)); + + match result { + ScriptRewriteAction::Replace(value) => { + assert!(value.contains("ts.example.com") && value.contains("/reviews")); + assert!(value.contains("ts.example.com") && value.contains("/sign-in")); + assert!(value.contains(r#""fallbackHref":"http://origin.example.com/legacy""#)); + assert!(value.contains(r#""protoRelative":"//origin.example.com/assets/logo.png""#)); + } + _ => panic!("Expected rewrite to update payload"), + } + } + + #[test] + fn rewrite_helper_handles_protocol_relative_urls() { + let content = r#"{"props":{"pageProps":{"link":"//origin.example.com/image.png"}}}"#; + let rewritten = rewrite_nextjs_values( + content, + "origin.example.com", + "ts.example.com", + "https", + &["link".into()], + ) + .expect("should rewrite protocol relative link"); + + assert!(rewritten.contains("ts.example.com") && rewritten.contains("/image.png")); + } + + #[test] + fn truncated_string_without_urls_is_not_modified() { + let truncated = r#"self.__next_f.push([ + 1, + 
'430:I[6061,["749","static/chunks/16bf9003-553c36acd7d8a04b.js","4669","static/chun' +]);"#; + + let result = rewrite_nextjs_values( + truncated, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + ); + + assert!( + result.is_none(), + "Truncated content without URLs should not be modified" + ); + } + + #[test] + fn complete_string_with_url_is_rewritten() { + let complete = r#"self.__next_f.push([ + 1, + '{"url":"https://origin.example.com/path/to/resource"}' +]);"#; + + let result = rewrite_nextjs_values( + complete, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + ) + .expect("should rewrite URL"); + + assert!( + result.contains("proxy.example.com") && result.contains("/path/to/resource"), + "Complete URL should be rewritten. Got: {result}" + ); + } + + #[test] + fn truncated_url_without_closing_quote_is_not_modified() { + let truncated_url = r#"self.__next_f.push([ + 1, + '\"url\":\"https://origin.example.com/rss?title=%20' +]);"#; + + let result = rewrite_nextjs_values( + truncated_url, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + ); + + assert!( + result.is_none(), + "Truncated URL without closing quote should not be modified" + ); + } + + #[test] + fn backslash_n_is_preserved() { + let input = + r#"self.__next_f.push([1, 'foo\n{"url":"https://origin.example.com/test"}\nbar']);"#; + + let backslash_n_pos = input.find(r"\n").expect("should contain \\n"); + assert_eq!( + &input.as_bytes()[backslash_n_pos..backslash_n_pos + 2], + [0x5C, 0x6E], + "Input should have literal backslash-n" + ); + + let rewritten = rewrite_nextjs_values( + input, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + ) + .expect("should rewrite URL"); + + let new_pos = rewritten.find(r"\n").expect("should contain \\n"); + assert_eq!( + &rewritten.as_bytes()[new_pos..new_pos + 2], + [0x5C, 0x6E], + "Rewritten should preserve literal backslash-n" + ); + } + + #[test] + fn 
site_production_domain_is_rewritten() { + let input = r#"self.__next_f.push([1, '{"siteProductionDomain":"origin.example.com","url":"https://origin.example.com/news"}']);"#; + + let rewritten = rewrite_nextjs_values( + input, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into(), "siteProductionDomain".into()], + ) + .expect("should rewrite URLs"); + + assert!( + rewritten.contains("proxy.example.com") && rewritten.contains("/news"), + "Expected host to be rewritten. Got: {rewritten}" + ); + assert!( + !rewritten.contains("origin.example.com"), + "Original host should not remain" + ); + } + + #[test] + fn url_rewriter_rewrites_url() { + let rewriter = UrlRewriter::new( + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + ); + + let new_url = rewriter + .rewrite_url_value("https://origin.example.com/news") + .expect("URL should be rewritten"); + assert_eq!(new_url, "http://proxy.example.com/news"); + } +} diff --git a/crates/common/src/integrations/nextjs/shared.rs b/crates/common/src/integrations/nextjs/shared.rs new file mode 100644 index 0000000..aedac84 --- /dev/null +++ b/crates/common/src/integrations/nextjs/shared.rs @@ -0,0 +1,229 @@ +//! Shared utilities for Next.js integration modules. + +use std::borrow::Cow; + +use once_cell::sync::Lazy; +use regex::{escape, Regex}; + +use crate::host_rewrite::rewrite_bare_host_at_boundaries; + +/// RSC push script call pattern for extracting payload string boundaries. +pub(crate) static RSC_PUSH_CALL_PATTERN: Lazy = Lazy::new(|| { + Regex::new( + r#"(?s)(?:(?:self|window)\.__next_f\.push|\(\s*(?:self|window)\.__next_f\s*=\s*(?:self|window)\.__next_f\s*\|\|\s*\[\]\s*\)\s*\.push)\(\[\s*1\s*,\s*(['"])"#, + ) + .expect("valid RSC push call regex") +}); + +/// Find the payload string boundaries within an RSC push script. +/// +/// Returns `Some((start, end))` where `start` is the position after the opening quote +/// and `end` is the position of the closing quote. 
+pub(crate) fn find_rsc_push_payload_range(script: &str) -> Option<(usize, usize)> { + let cap = RSC_PUSH_CALL_PATTERN.captures(script)?; + let quote_match = cap.get(1)?; + let quote = quote_match + .as_str() + .chars() + .next() + .expect("push call regex should capture a quote character"); + let payload_start = quote_match.end(); + + let bytes = script.as_bytes(); + let mut i = payload_start; + while i < bytes.len() { + if bytes[i] == b'\\' && i + 1 < bytes.len() { + i += 2; + } else if bytes[i] == b'\\' { + return None; + } else if bytes[i] == quote as u8 { + return Some((payload_start, i)); + } else { + i += 1; + } + } + + None +} + +// ============================================================================= +// URL Rewriting +// ============================================================================= + +/// Rewriter for URL patterns in RSC payloads. +/// +/// This rewrites all occurrences of origin URLs in content, including: +/// - Full URLs: `https://origin.example.com/path` or `http://origin.example.com/path` +/// - Protocol-relative: `//origin.example.com/path` +/// - Escaped variants: `\/\/origin.example.com` (JSON-escaped) +/// - Bare hostnames: `origin.example.com` (as JSON values) +/// +/// Use this for RSC T-chunk content where any origin URL should be rewritten. +/// For attribute-specific rewriting (e.g., only rewrite `"href"` values), use +/// the `UrlRewriter` in `script_rewriter.rs` instead. 
+pub(crate) struct RscUrlRewriter { + origin_host: String, + request_host: String, + request_scheme: String, + pattern: Regex, +} + +impl RscUrlRewriter { + pub(crate) fn new(origin_host: &str, request_host: &str, request_scheme: &str) -> Self { + let escaped_origin = escape(origin_host); + + // Match: + // - https://origin_host or http://origin_host + // - //origin_host (protocol-relative) + // - escaped variants inside JSON-in-JS strings (e.g., \/\/origin_host) + let pattern = Regex::new(&format!( + r#"(https?)?(:)?(\\\\\\\\\\\\\\\\//|\\\\\\\\//|\\/\\/|//){}"#, + escaped_origin + )) + .expect("valid RSC URL rewrite regex"); + + Self { + origin_host: origin_host.to_string(), + request_host: request_host.to_string(), + request_scheme: request_scheme.to_string(), + pattern, + } + } + + pub(crate) fn rewrite<'a>(&self, input: &'a str) -> Cow<'a, str> { + if !input.contains(&self.origin_host) { + return Cow::Borrowed(input); + } + + // Phase 1: Regex-based URL pattern rewriting (handles escaped slashes, schemes, etc.) + let replaced = self + .pattern + .replace_all(input, |caps: ®ex::Captures<'_>| { + let slashes = caps.get(3).map_or("//", |m| m.as_str()); + if caps.get(1).is_some() { + format!("{}:{}{}", self.request_scheme, slashes, self.request_host) + } else { + format!("{}{}", slashes, self.request_host) + } + }); + + // Phase 2: Handle bare host occurrences not matched by the URL regex + // (e.g., `siteProductionDomain`). Only check if regex made no changes, + // because if it did, we already know origin_host was present. 
+ let text = match &replaced { + Cow::Borrowed(s) => *s, + Cow::Owned(s) => s.as_str(), + }; + + if !text.contains(&self.origin_host) { + return replaced; + } + + rewrite_bare_host_at_boundaries(text, &self.origin_host, &self.request_host) + .map(Cow::Owned) + .unwrap_or(replaced) + } + + pub(crate) fn rewrite_to_string(&self, input: &str) -> String { + self.rewrite(input).into_owned() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn finds_double_quoted_payload() { + let script = r#"self.__next_f.push([1,"hello world"])"#; + let (start, end) = find_rsc_push_payload_range(script).expect("should find payload"); + assert_eq!(&script[start..end], "hello world"); + } + + #[test] + fn finds_single_quoted_payload() { + let script = r#"self.__next_f.push([1,'hello world'])"#; + let (start, end) = find_rsc_push_payload_range(script).expect("should find payload"); + assert_eq!(&script[start..end], "hello world"); + } + + #[test] + fn finds_assignment_form() { + let script = r#"(self.__next_f=self.__next_f||[]).push([1,"payload"])"#; + let (start, end) = find_rsc_push_payload_range(script).expect("should find payload"); + assert_eq!(&script[start..end], "payload"); + } + + #[test] + fn returns_none_for_trailing_backslash() { + let script = r#"self.__next_f.push([1,"incomplete\"])"#; + assert!(find_rsc_push_payload_range(script).is_none()); + } + + #[test] + fn returns_none_for_unterminated_string() { + let script = r#"self.__next_f.push([1,"no closing quote"#; + assert!(find_rsc_push_payload_range(script).is_none()); + } + + // RscUrlRewriter tests + + #[test] + fn rsc_url_rewriter_rewrites_https_url() { + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let input = r#"{"url":"https://origin.example.com/path"}"#; + let result = rewriter.rewrite(input); + assert_eq!(result, r#"{"url":"https://proxy.example.com/path"}"#); + } + + #[test] + fn rsc_url_rewriter_rewrites_http_url() { + let rewriter = 
RscUrlRewriter::new("origin.example.com", "proxy.example.com", "http"); + let input = r#"{"url":"http://origin.example.com/path"}"#; + let result = rewriter.rewrite(input); + assert_eq!(result, r#"{"url":"http://proxy.example.com/path"}"#); + } + + #[test] + fn rsc_url_rewriter_rewrites_protocol_relative_url() { + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let input = r#"{"url":"//origin.example.com/path"}"#; + let result = rewriter.rewrite(input); + assert_eq!(result, r#"{"url":"//proxy.example.com/path"}"#); + } + + #[test] + fn rsc_url_rewriter_rewrites_escaped_slashes() { + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let input = r#"{"url":"\/\/origin.example.com/path"}"#; + let result = rewriter.rewrite(input); + assert_eq!(result, r#"{"url":"\/\/proxy.example.com/path"}"#); + } + + #[test] + fn rsc_url_rewriter_rewrites_bare_host() { + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let input = r#"{"siteProductionDomain":"origin.example.com"}"#; + let result = rewriter.rewrite(input); + assert_eq!(result, r#"{"siteProductionDomain":"proxy.example.com"}"#); + } + + #[test] + fn rsc_url_rewriter_does_not_rewrite_partial_hostname() { + let rewriter = RscUrlRewriter::new("example.com", "proxy.example.com", "https"); + let input = r#"{"domain":"subexample.com"}"#; + let result = rewriter.rewrite(input); + // Should not rewrite because "example.com" is not a standalone host here + assert_eq!(result, r#"{"domain":"subexample.com"}"#); + } + + #[test] + fn rsc_url_rewriter_no_change_when_origin_not_present() { + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let input = r#"{"url":"https://other.example.com/path"}"#; + let result = rewriter.rewrite(input); + // Should return borrowed reference (no allocation) + assert!(matches!(result, Cow::Borrowed(_))); + assert_eq!(result, input); + } +} 
diff --git a/crates/common/src/integrations/registry.rs b/crates/common/src/integrations/registry.rs index 9db8033..819890d 100644 --- a/crates/common/src/integrations/registry.rs +++ b/crates/common/src/integrations/registry.rs @@ -1,5 +1,6 @@ +use std::any::Any; use std::collections::BTreeMap; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use async_trait::async_trait; use error_stack::Report; @@ -87,6 +88,82 @@ pub struct IntegrationScriptContext<'a> { pub request_host: &'a str, pub request_scheme: &'a str, pub origin_host: &'a str, + pub is_last_in_text_node: bool, + pub document_state: &'a IntegrationDocumentState, +} + +/// Per-document state shared between HTML/script rewriters and post-processors. +/// +/// This exists to support multi-phase HTML processing without requiring a second HTML parse. +#[derive(Clone, Default)] +pub struct IntegrationDocumentState { + inner: Arc>>>, +} + +impl std::fmt::Debug for IntegrationDocumentState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let keys: Vec<&'static str> = { + let guard = self + .inner + .lock() + .expect("should lock integration document state"); + guard.keys().copied().collect() + }; + f.debug_struct("IntegrationDocumentState") + .field("keys", &keys) + .finish() + } +} + +impl IntegrationDocumentState { + pub fn get(&self, integration_id: &'static str) -> Option> + where + T: Any + Send + Sync + 'static, + { + let guard = self + .inner + .lock() + .expect("should lock integration document state"); + guard.get(integration_id).and_then(|value| { + let cloned: Arc = Arc::clone(value); + cloned.downcast::().ok() + }) + } + + pub fn get_or_insert_with( + &self, + integration_id: &'static str, + init: impl FnOnce() -> T, + ) -> Arc + where + T: Any + Send + Sync + 'static, + { + let mut guard = self + .inner + .lock() + .expect("should lock integration document state"); + + if let Some(existing) = guard.get(integration_id) { + if let Ok(downcast) = 
Arc::clone(existing).downcast::() { + return downcast; + } + } + + let value: Arc = Arc::new(init()); + guard.insert( + integration_id, + Arc::clone(&value) as Arc, + ); + value + } + + pub fn clear(&self) { + let mut guard = self + .inner + .lock() + .expect("should lock integration document state"); + guard.clear(); + } } /// Describes an HTTP endpoint exposed by an integration. @@ -249,12 +326,44 @@ pub trait IntegrationScriptRewriter: Send + Sync { fn rewrite(&self, content: &str, ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction; } +/// Context for HTML post-processors. +#[derive(Debug)] +pub struct IntegrationHtmlContext<'a> { + pub request_host: &'a str, + pub request_scheme: &'a str, + pub origin_host: &'a str, + pub document_state: &'a IntegrationDocumentState, +} + +/// Trait for integration-provided HTML post-processors. +/// These run after streaming HTML processing to handle cases that require +/// access to the complete HTML (e.g., cross-script RSC T-chunks). +pub trait IntegrationHtmlPostProcessor: Send + Sync { + /// Identifier for logging/diagnostics. + fn integration_id(&self) -> &'static str; + + /// Fast preflight check to decide whether post-processing should run for this document. + /// + /// Implementations should keep this cheap (e.g., a substring check) because it may run on + /// every HTML response when the integration is enabled. + fn should_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> bool { + let _ = (html, ctx); + false + } + + /// Post-process complete HTML content. + /// This is called after streaming HTML processing with the complete HTML. + /// Implementations should mutate `html` in-place and return `true` when changes were made. + fn post_process(&self, html: &mut String, ctx: &IntegrationHtmlContext<'_>) -> bool; +} + /// Registration payload returned by integration builders. 
pub struct IntegrationRegistration { pub integration_id: &'static str, pub proxies: Vec>, pub attribute_rewriters: Vec>, pub script_rewriters: Vec>, + pub html_post_processors: Vec>, } impl IntegrationRegistration { @@ -276,6 +385,7 @@ impl IntegrationRegistrationBuilder { proxies: Vec::new(), attribute_rewriters: Vec::new(), script_rewriters: Vec::new(), + html_post_processors: Vec::new(), }, } } @@ -301,6 +411,15 @@ impl IntegrationRegistrationBuilder { self } + #[must_use] + pub fn with_html_post_processor( + mut self, + processor: Arc, + ) -> Self { + self.registration.html_post_processors.push(processor); + self + } + #[must_use] pub fn build(self) -> IntegrationRegistration { self.registration @@ -321,6 +440,7 @@ struct IntegrationRegistryInner { routes: Vec<(IntegrationEndpoint, &'static str)>, html_rewriters: Vec>, script_rewriters: Vec>, + html_post_processors: Vec>, } impl Default for IntegrationRegistryInner { @@ -334,6 +454,7 @@ impl Default for IntegrationRegistryInner { routes: Vec::new(), html_rewriters: Vec::new(), script_rewriters: Vec::new(), + html_post_processors: Vec::new(), } } } @@ -415,6 +536,9 @@ impl IntegrationRegistry { inner .script_rewriters .extend(registration.script_rewriters.into_iter()); + inner + .html_post_processors + .extend(registration.html_post_processors.into_iter()); } } @@ -493,6 +617,11 @@ impl IntegrationRegistry { self.inner.script_rewriters.clone() } + /// Expose registered HTML post-processors. + pub fn html_post_processors(&self) -> Vec> { + self.inner.html_post_processors.clone() + } + /// Provide a snapshot of registered integrations and their hooks. 
pub fn registered_integrations(&self) -> Vec { let mut map: BTreeMap<&'static str, IntegrationMetadata> = BTreeMap::new(); @@ -538,6 +667,7 @@ impl IntegrationRegistry { routes: Vec::new(), html_rewriters: attribute_rewriters, script_rewriters, + html_post_processors: Vec::new(), }), } } @@ -580,6 +710,7 @@ impl IntegrationRegistry { routes: Vec::new(), html_rewriters: Vec::new(), script_rewriters: Vec::new(), + html_post_processors: Vec::new(), }), } } @@ -611,6 +742,35 @@ mod tests { } } + struct NoopHtmlPostProcessor; + + impl IntegrationHtmlPostProcessor for NoopHtmlPostProcessor { + fn integration_id(&self) -> &'static str { + "noop" + } + + fn post_process(&self, _html: &mut String, _ctx: &IntegrationHtmlContext<'_>) -> bool { + false + } + } + + #[test] + fn default_html_post_processor_should_process_is_false() { + let processor = NoopHtmlPostProcessor; + let document_state = IntegrationDocumentState::default(); + let ctx = IntegrationHtmlContext { + request_host: "proxy.example.com", + request_scheme: "https", + origin_host: "origin.example.com", + document_state: &document_state, + }; + + assert!( + !processor.should_process("", &ctx), + "Default `should_process` should be false to avoid running post-processing unexpectedly" + ); + } + #[test] fn test_exact_route_matching() { let routes = vec![( diff --git a/crates/common/src/lib.rs b/crates/common/src/lib.rs index fc0b888..b9f5fd5 100644 --- a/crates/common/src/lib.rs +++ b/crates/common/src/lib.rs @@ -29,6 +29,7 @@ pub mod creative; pub mod error; pub mod fastly_storage; pub mod geo; +pub(crate) mod host_rewrite; pub mod html_processor; pub mod http_util; pub mod integrations; @@ -37,6 +38,7 @@ pub mod openrtb; pub mod proxy; pub mod publisher; pub mod request_signing; +pub mod rsc_flight; pub mod settings; pub mod settings_data; pub mod streaming_processor; diff --git a/crates/common/src/publisher.rs b/crates/common/src/publisher.rs index a8c13a2..6041536 100644 --- a/crates/common/src/publisher.rs +++ 
b/crates/common/src/publisher.rs @@ -9,6 +9,7 @@ use crate::constants::{HEADER_SYNTHETIC_TRUSTED_SERVER, HEADER_X_COMPRESS_HINT}; use crate::cookies::create_synthetic_cookie; use crate::error::TrustedServerError; use crate::integrations::IntegrationRegistry; +use crate::rsc_flight::RscFlightUrlRewriter; use crate::settings::Settings; use crate::streaming_processor::{Compression, PipelineConfig, StreamProcessor, StreamingPipeline}; use crate::streaming_replacer::create_url_replacer; @@ -116,6 +117,15 @@ fn process_response_streaming( ) -> Result> { // Check if this is HTML content let is_html = params.content_type.contains("text/html"); + let is_rsc_flight = params.content_type.contains("text/x-component"); + log::debug!( + "process_response_streaming: content_type={}, content_encoding={}, is_html={}, is_rsc_flight={}, origin_host={}", + params.content_type, + params.content_encoding, + is_html, + is_rsc_flight, + params.origin_host + ); // Determine compression type let compression = Compression::from_content_encoding(params.content_encoding); @@ -140,6 +150,24 @@ fn process_response_streaming( chunk_size: 8192, }; + let mut pipeline = StreamingPipeline::new(config, processor); + pipeline.process(body, &mut output)?; + } else if is_rsc_flight { + // RSC Flight responses are length-prefixed (T rows). A naive string replacement will + // corrupt the stream by changing byte lengths without updating the prefixes. 
+ let processor = RscFlightUrlRewriter::new( + params.origin_host, + params.origin_url, + params.request_host, + params.request_scheme, + ); + + let config = PipelineConfig { + input_compression: compression, + output_compression: compression, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(config, processor); pipeline.process(body, &mut output)?; } else { @@ -161,7 +189,7 @@ fn process_response_streaming( pipeline.process(body, &mut output)?; } - log::info!( + log::debug!( "Streaming processing complete - output size: {} bytes", output.len() ); @@ -205,7 +233,7 @@ pub fn handle_publisher_request( integration_registry: &IntegrationRegistry, mut req: Request, ) -> Result> { - log::info!("Proxying request to publisher_origin"); + log::debug!("Proxying request to publisher_origin"); // Prebid.js requests are not intercepted here anymore. The HTML processor rewrites // any Prebid script references to `/static/tsjs-ext.min.js` when auto-configure is enabled. @@ -221,7 +249,7 @@ pub fn handle_publisher_request( let request_scheme = detect_request_scheme(&req); // Log detection details for debugging - log::info!( + log::debug!( "Scheme detection - TLS Protocol: {:?}, TLS Cipher: {:?}, Forwarded: {:?}, X-Forwarded-Proto: {:?}, Fastly-SSL: {:?}, Result: {}", req.get_tls_protocol(), req.get_tls_cipher_openssl_name(), @@ -231,7 +259,7 @@ pub fn handle_publisher_request( request_scheme ); - log::info!("Request host: {}, scheme: {}", request_host, request_scheme); + log::debug!("Request host: {}, scheme: {}", request_host, request_scheme); // Generate synthetic identifiers before the request body is consumed. 
let synthetic_id = get_or_generate_synthetic_id(settings, &req)?; @@ -245,7 +273,7 @@ pub fn handle_publisher_request( }) .unwrap_or(false); - log::info!( + log::debug!( "Proxy synthetic IDs - trusted: {}, has_cookie: {}", synthetic_id, has_synthetic_cookie @@ -254,7 +282,7 @@ pub fn handle_publisher_request( let backend_name = ensure_backend_from_url(&settings.publisher.origin_url)?; let origin_host = settings.publisher.origin_host(); - log::info!( + log::debug!( "Proxying to dynamic backend: {} (from {})", backend_name, settings.publisher.origin_url @@ -268,9 +296,9 @@ pub fn handle_publisher_request( })?; // Log all response headers for debugging - log::info!("Response headers:"); + log::debug!("Response headers:"); for (name, value) in response.get_headers() { - log::info!(" {}: {:?}", name, value); + log::debug!(" {}: {:?}", name, value); } // Check if the response has a text-based content type that we should process @@ -293,7 +321,7 @@ pub fn handle_publisher_request( .to_lowercase(); // Log response details for debugging - log::info!( + log::debug!( "Processing response - Content-Type: {}, Content-Encoding: {}, Request Host: {}, Origin Host: {}", content_type, content_encoding, request_host, origin_host ); @@ -321,12 +349,12 @@ pub fn handle_publisher_request( response.remove_header(header::CONTENT_LENGTH); // Keep Content-Encoding header since we're returning compressed content - log::info!( + log::debug!( "Preserved Content-Encoding: {} for compressed response", content_encoding ); - log::info!("Completed streaming processing of response body"); + log::debug!("Completed streaming processing of response body"); } Err(e) => { log::error!("Failed to process response body: {:?}", e); @@ -335,7 +363,7 @@ pub fn handle_publisher_request( } } } else { - log::info!( + log::debug!( "Skipping response processing - should_process: {}, request_host: '{}'", should_process, request_host diff --git a/crates/common/src/rsc_flight.rs b/crates/common/src/rsc_flight.rs new 
file mode 100644 index 0000000..309e950 --- /dev/null +++ b/crates/common/src/rsc_flight.rs @@ -0,0 +1,391 @@ +use std::io; + +use crate::host_rewrite::rewrite_bare_host_at_boundaries; +use crate::streaming_processor::StreamProcessor; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum RowState { + Id, + Tag, + Length, + ChunkByNewline, + ChunkByLength, +} + +/// Rewrites URLs inside a React Server Components (RSC) Flight stream. +/// +/// Next.js (App Router) uses `react-server-dom-webpack` ("Flight") for navigation responses +/// and for inlined `__next_f` data. The wire format is a sequence of rows: +/// - `:\n` (JSON terminated by `\n`) +/// - `:\n` (tagged, terminated by `\n`) +/// - `:T,` (tagged by `T`, length-delimited, **no trailing newline**) +/// +/// For `T` rows, the length prefix is the UTF-8 byte length of the content bytes. If we rewrite +/// URLs inside the content, we must recompute the length and rewrite the header. +/// +/// ## Limitations +/// +/// This rewriter performs simple string replacement and does NOT handle JSON escape sequences. +/// URLs like `\/\/origin.example.com` (JSON-escaped slashes) will not be rewritten. This is +/// acceptable because Flight responses from client-side navigation typically contain plain URLs, +/// not doubly-escaped JSON-in-JS content. For inlined `__next_f` data in HTML (which can have +/// escape sequences), the HTML post-processor in `integrations/nextjs/` handles those cases. 
+pub struct RscFlightUrlRewriter { + origin_url: String, + origin_http_url: Option, + origin_host: String, + origin_protocol_relative: String, + request_url: String, + request_host: String, + request_protocol_relative: String, + + state: RowState, + row_id: Vec, + row_tag: Option, + declared_length: usize, + remaining_length: usize, + row_content: Vec, + raw_header: Vec, +} + +impl RscFlightUrlRewriter { + #[must_use] + pub fn new( + origin_host: &str, + origin_url: &str, + request_host: &str, + request_scheme: &str, + ) -> Self { + let request_url = format!("{request_scheme}://{request_host}"); + let origin_protocol_relative = format!("//{origin_host}"); + let request_protocol_relative = format!("//{request_host}"); + + let origin_http_url = origin_url + .strip_prefix("https://") + .map(|rest| format!("http://{rest}")); + + Self { + origin_url: origin_url.to_string(), + origin_http_url, + origin_host: origin_host.to_string(), + origin_protocol_relative, + request_url, + request_host: request_host.to_string(), + request_protocol_relative, + state: RowState::Id, + row_id: Vec::new(), + row_tag: None, + declared_length: 0, + remaining_length: 0, + row_content: Vec::new(), + raw_header: Vec::new(), + } + } + + fn reset_row(&mut self) { + self.state = RowState::Id; + self.row_id.clear(); + self.row_tag = None; + self.declared_length = 0; + self.remaining_length = 0; + self.row_content.clear(); + self.raw_header.clear(); + } + + fn rewrite_utf8_bytes(&self, bytes: &[u8]) -> Vec { + let Ok(text) = std::str::from_utf8(bytes) else { + return bytes.to_vec(); + }; + + if !text.contains(&self.origin_host) && !text.contains(&self.origin_url) { + if let Some(http_url) = &self.origin_http_url { + if !text.contains(http_url) { + return bytes.to_vec(); + } + } else { + return bytes.to_vec(); + } + } + + // Keep replacement semantics consistent with `create_url_replacer`. 
+ let mut rewritten = text.replace(&self.origin_url, &self.request_url); + if let Some(http_url) = &self.origin_http_url { + rewritten = rewritten.replace(http_url, &self.request_url); + } + rewritten = rewritten.replace( + &self.origin_protocol_relative, + &self.request_protocol_relative, + ); + rewritten = + rewrite_bare_host_at_boundaries(&rewritten, &self.origin_host, &self.request_host) + .unwrap_or(rewritten); + + rewritten.into_bytes() + } + + fn finalize_newline_row(&mut self, out: &mut Vec) { + out.extend_from_slice(&self.row_id); + out.push(b':'); + if let Some(tag) = self.row_tag { + out.push(tag); + } + let rewritten = self.rewrite_utf8_bytes(&self.row_content); + out.extend_from_slice(&rewritten); + out.push(b'\n'); + self.reset_row(); + } + + fn finalize_length_row(&mut self, out: &mut Vec) { + let Some(tag) = self.row_tag else { + // Should never happen for length-delimited rows; fall back to passthrough. + out.extend_from_slice(&self.raw_header); + out.extend_from_slice(&self.row_content); + self.reset_row(); + return; + }; + + out.extend_from_slice(&self.row_id); + out.push(b':'); + out.push(tag); + + if tag == b'T' { + let rewritten = self.rewrite_utf8_bytes(&self.row_content); + let new_len = rewritten.len(); + out.extend_from_slice(format!("{new_len:x}").as_bytes()); + out.push(b','); + out.extend_from_slice(&rewritten); + } else { + // Length-delimited row type we don't transform (e.g., future/binary Flight types). 
+ out.extend_from_slice(format!("{:x}", self.declared_length).as_bytes()); + out.push(b','); + out.extend_from_slice(&self.row_content); + } + + self.reset_row(); + } + + fn flush_partial_row(&mut self, out: &mut Vec) { + if self.raw_header.is_empty() && self.row_content.is_empty() { + return; + } + out.extend_from_slice(&self.raw_header); + out.extend_from_slice(&self.row_content); + self.reset_row(); + } +} + +impl StreamProcessor for RscFlightUrlRewriter { + fn process_chunk(&mut self, chunk: &[u8], is_last: bool) -> Result, io::Error> { + let mut out = Vec::with_capacity(chunk.len()); + let mut i = 0; + + while i < chunk.len() { + match self.state { + RowState::Id => { + let b = chunk[i]; + i += 1; + if b == b':' { + self.raw_header.push(b':'); + self.state = RowState::Tag; + } else { + self.row_id.push(b); + self.raw_header.push(b); + } + } + RowState::Tag => { + let b = chunk[i]; + i += 1; + + if b == b'T' || b == b'V' { + self.row_tag = Some(b); + self.raw_header.push(b); + self.state = RowState::Length; + self.declared_length = 0; + } else if b.is_ascii_uppercase() { + self.row_tag = Some(b); + self.raw_header.push(b); + self.state = RowState::ChunkByNewline; + } else { + // Not a recognized tag; treat as first byte of a JSON row. 
+ self.row_tag = None; + self.row_content.push(b); + self.state = RowState::ChunkByNewline; + } + } + RowState::Length => { + let b = chunk[i]; + i += 1; + if b == b',' { + self.raw_header.push(b','); + self.remaining_length = self.declared_length; + self.state = RowState::ChunkByLength; + } else { + self.raw_header.push(b); + let digit = match b { + b'0'..=b'9' => (b - b'0') as usize, + b'a'..=b'f' => (b - b'a' + 10) as usize, + b'A'..=b'F' => (b - b'A' + 10) as usize, + _ => 0, + }; + self.declared_length = (self.declared_length << 4) | digit; + } + } + RowState::ChunkByNewline => { + let Some(pos) = chunk[i..].iter().position(|&b| b == b'\n') else { + self.row_content.extend_from_slice(&chunk[i..]); + break; + }; + let end = i + pos; + self.row_content.extend_from_slice(&chunk[i..end]); + i = end + 1; // Skip '\n' + self.finalize_newline_row(&mut out); + } + RowState::ChunkByLength => { + let available = chunk.len() - i; + let take = available.min(self.remaining_length); + self.row_content.extend_from_slice(&chunk[i..i + take]); + i += take; + self.remaining_length -= take; + + if self.remaining_length == 0 { + self.finalize_length_row(&mut out); + } + } + } + } + + if is_last { + self.flush_partial_row(&mut out); + } + + Ok(out) + } + + fn reset(&mut self) { + self.reset_row(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn run_rewriter( + rewriter: &mut RscFlightUrlRewriter, + input: &[u8], + chunk_size: usize, + ) -> Vec { + let mut output = Vec::new(); + let mut pos = 0; + while pos < input.len() { + let end = (pos + chunk_size).min(input.len()); + let chunk = &input[pos..end]; + let rewritten = rewriter + .process_chunk(chunk, false) + .expect("should process chunk"); + output.extend_from_slice(&rewritten); + pos = end; + } + + let tail = rewriter + .process_chunk(&[], true) + .expect("should process final chunk"); + output.extend_from_slice(&tail); + output + } + + #[test] + fn rewrites_newline_rows() { + let input = 
b"0:[\"https://origin.example.com/page\"]\n"; + + let mut rewriter = RscFlightUrlRewriter::new( + "origin.example.com", + "https://origin.example.com", + "proxy.example.com", + "https", + ); + + let output = run_rewriter(&mut rewriter, input, 8); + let output_str = String::from_utf8(output).expect("should be valid UTF-8"); + assert_eq!( + output_str, "0:[\"https://proxy.example.com/page\"]\n", + "Output should rewrite URLs in newline rows" + ); + } + + #[test] + fn rewrites_t_rows_and_updates_length() { + let t_content = r#"{"url":"https://origin.example.com/page"}"#; + let json_row = "2:[\"ok\"]\n"; + let input = format!("1:T{:x},{}{}", t_content.len(), t_content, json_row); + + let mut rewriter = RscFlightUrlRewriter::new( + "origin.example.com", + "https://origin.example.com", + "proxy.example.com", + "https", + ); + + let output = run_rewriter(&mut rewriter, input.as_bytes(), 7); + let output_str = String::from_utf8(output).expect("should be valid UTF-8"); + + let rewritten_t_content = r#"{"url":"https://proxy.example.com/page"}"#; + let expected = format!( + "1:T{:x},{}{}", + rewritten_t_content.len(), + rewritten_t_content, + json_row + ); + + assert_eq!( + output_str, expected, + "Output should update T row lengths after rewriting" + ); + } + + #[test] + fn handles_t_row_header_and_body_split_across_chunks() { + let t_content = r#"{"url":"https://origin.example.com/page"}"#; + let input = format!("1:T{:x},{}", t_content.len(), t_content); + + let mut rewriter = RscFlightUrlRewriter::new( + "origin.example.com", + "https://origin.example.com", + "proxy.example.com", + "https", + ); + + // Split such that the header ends before the comma and content begins in a later chunk. 
+ let output = run_rewriter(&mut rewriter, input.as_bytes(), 3); + let output_str = String::from_utf8(output).expect("should be valid UTF-8"); + + let rewritten_t_content = r#"{"url":"https://proxy.example.com/page"}"#; + let expected = format!("1:T{:x},{}", rewritten_t_content.len(), rewritten_t_content,); + + assert_eq!( + output_str, expected, + "Rewriter should handle T rows split across chunks" + ); + } + + #[test] + fn bare_host_rewrite_respects_hostname_boundaries() { + let input = b"0:[\"cdn.origin.example.com\",\"notorigin.example.com\",\"origin.example.com.uk\",\"origin.example.com/news\",\"origin.example.com\"]\n"; + + let mut rewriter = RscFlightUrlRewriter::new( + "origin.example.com", + "https://origin.example.com", + "proxy.example.com", + "https", + ); + + let output = run_rewriter(&mut rewriter, input, 5); + let output_str = String::from_utf8(output).expect("should be valid UTF-8"); + + assert_eq!( + output_str, + "0:[\"cdn.origin.example.com\",\"notorigin.example.com\",\"origin.example.com.uk\",\"proxy.example.com/news\",\"proxy.example.com\"]\n", + "Output should only rewrite bare host occurrences" + ); + } +} diff --git a/crates/common/src/settings.rs b/crates/common/src/settings.rs index 25149d9..4a41b1e 100644 --- a/crates/common/src/settings.rs +++ b/crates/common/src/settings.rs @@ -53,6 +53,17 @@ impl Publisher { }) .unwrap_or_else(|| self.origin_url.clone()) } + + fn normalize(&mut self) { + let trimmed = self.origin_url.trim_end_matches('/'); + if trimmed != self.origin_url { + log::warn!( + "publisher.origin_url ends with '/': normalizing to {}", + trimmed + ); + self.origin_url = trimmed.to_string(); + } + } } #[derive(Debug, Default, Deserialize, Serialize)] @@ -318,12 +329,15 @@ impl Settings { .change_context(TrustedServerError::Configuration { message: "Failed to build configuration".to_string(), })?; - // You can deserialize (and thus freeze) the entire configuration as - config - .try_deserialize() - 
.change_context(TrustedServerError::Configuration { - message: "Failed to deserialize configuration".to_string(), - }) + let mut settings: Self = + config + .try_deserialize() + .change_context(TrustedServerError::Configuration { + message: "Failed to deserialize configuration".to_string(), + })?; + + settings.publisher.normalize(); + Ok(settings) } #[must_use] @@ -417,6 +431,7 @@ mod tests { use serde_json::json; use crate::integrations::{nextjs::NextJsIntegrationConfig, prebid::PrebidIntegrationConfig}; + use crate::streaming_replacer::create_url_replacer; use crate::test_support::tests::{crate_test_settings_str, create_test_settings}; #[test] @@ -450,8 +465,8 @@ mod tests { assert_eq!(raw_nextjs["enabled"], json!(false)); assert_eq!( raw_nextjs["rewrite_attributes"], - json!(["href", "link", "url"]), - "Next.js rewrite attributes should default to href/link/url" + json!(["href", "link", "siteBaseUrl", "siteProductionDomain", "url"]), + "Next.js rewrite attributes should include href/link/siteBaseUrl/siteProductionDomain/url for RSC navigation" ); assert!(!settings.synthetic.counter_store.is_empty()); @@ -507,6 +522,35 @@ mod tests { settings.validate().expect("Failed to validate settings"); } + #[test] + fn from_toml_normalizes_trailing_slash_in_origin_url() { + let toml_str = crate_test_settings_str().replace( + r#"origin_url = "https://origin.test-publisher.com""#, + r#"origin_url = "https://origin.test-publisher.com/""#, + ); + + let settings = Settings::from_toml(&toml_str).expect("should parse valid TOML"); + assert_eq!( + settings.publisher.origin_url, "https://origin.test-publisher.com", + "origin_url should be normalized by trimming trailing slashes" + ); + + let origin_host = settings.publisher.origin_host(); + let mut replacer = create_url_replacer( + &origin_host, + &settings.publisher.origin_url, + "proxy.example.com", + "https", + ); + + let processed = replacer.process_chunk(b"https://origin.test-publisher.com/news", true); + let rewritten = 
String::from_utf8(processed).expect("should be valid UTF-8"); + assert_eq!( + rewritten, "https://proxy.example.com/news", + "rewriting should keep the delimiter slash between host and path" + ); + } + #[test] fn test_settings_missing_required_fields() { let re = Regex::new(r"origin_url = .*").unwrap(); diff --git a/crates/common/src/streaming_processor.rs b/crates/common/src/streaming_processor.rs index 88d3a64..eb5d608 100644 --- a/crates/common/src/streaming_processor.rs +++ b/crates/common/src/streaming_processor.rs @@ -92,12 +92,17 @@ impl StreamingPipeline

{ ) { (Compression::None, Compression::None) => self.process_uncompressed(input, output), (Compression::Gzip, Compression::Gzip) => self.process_gzip_to_gzip(input, output), + (Compression::Gzip, Compression::None) => self.process_gzip_to_none(input, output), (Compression::Deflate, Compression::Deflate) => { self.process_deflate_to_deflate(input, output) } + (Compression::Deflate, Compression::None) => { + self.process_deflate_to_none(input, output) + } (Compression::Brotli, Compression::Brotli) => { self.process_brotli_to_brotli(input, output) } + (Compression::Brotli, Compression::None) => self.process_brotli_to_none(input, output), _ => Err(Report::new(TrustedServerError::Proxy { message: "Unsupported compression transformation".to_string(), })), @@ -206,6 +211,48 @@ impl StreamingPipeline

{ Ok(()) } + /// Process gzip compressed input to uncompressed output (decompression only) + fn process_gzip_to_none( + &mut self, + input: R, + mut output: W, + ) -> Result<(), Report> { + use flate2::read::GzDecoder; + + // Decompress input + let mut decoder = GzDecoder::new(input); + let mut decompressed = Vec::new(); + decoder + .read_to_end(&mut decompressed) + .change_context(TrustedServerError::Proxy { + message: "Failed to decompress gzip".to_string(), + })?; + + log::info!( + "[Gzip->None] Decompressed size: {} bytes", + decompressed.len() + ); + + // Process the decompressed content + let processed = self + .processor + .process_chunk(&decompressed, true) + .change_context(TrustedServerError::Proxy { + message: "Failed to process content".to_string(), + })?; + + log::info!("[Gzip->None] Processed size: {} bytes", processed.len()); + + // Write uncompressed output + output + .write_all(&processed) + .change_context(TrustedServerError::Proxy { + message: "Failed to write output".to_string(), + })?; + + Ok(()) + } + /// Process deflate compressed stream fn process_deflate_to_deflate( &mut self, @@ -222,6 +269,48 @@ impl StreamingPipeline

{ self.process_through_compression(decoder, encoder) } + /// Process deflate compressed input to uncompressed output (decompression only) + fn process_deflate_to_none( + &mut self, + input: R, + mut output: W, + ) -> Result<(), Report> { + use flate2::read::ZlibDecoder; + + // Decompress input + let mut decoder = ZlibDecoder::new(input); + let mut decompressed = Vec::new(); + decoder + .read_to_end(&mut decompressed) + .change_context(TrustedServerError::Proxy { + message: "Failed to decompress deflate".to_string(), + })?; + + log::info!( + "[Deflate->None] Decompressed size: {} bytes", + decompressed.len() + ); + + // Process the decompressed content + let processed = self + .processor + .process_chunk(&decompressed, true) + .change_context(TrustedServerError::Proxy { + message: "Failed to process content".to_string(), + })?; + + log::info!("[Deflate->None] Processed size: {} bytes", processed.len()); + + // Write uncompressed output + output + .write_all(&processed) + .change_context(TrustedServerError::Proxy { + message: "Failed to write output".to_string(), + })?; + + Ok(()) + } + /// Process brotli compressed stream fn process_brotli_to_brotli( &mut self, @@ -243,6 +332,48 @@ impl StreamingPipeline

{ self.process_through_compression(decoder, encoder) } + /// Process brotli compressed input to uncompressed output (decompression only) + fn process_brotli_to_none( + &mut self, + input: R, + mut output: W, + ) -> Result<(), Report> { + use brotli::Decompressor; + + // Decompress input + let mut decoder = Decompressor::new(input, 4096); + let mut decompressed = Vec::new(); + decoder + .read_to_end(&mut decompressed) + .change_context(TrustedServerError::Proxy { + message: "Failed to decompress brotli".to_string(), + })?; + + log::info!( + "[Brotli->None] Decompressed size: {} bytes", + decompressed.len() + ); + + // Process the decompressed content + let processed = self + .processor + .process_chunk(&decompressed, true) + .change_context(TrustedServerError::Proxy { + message: "Failed to process content".to_string(), + })?; + + log::info!("[Brotli->None] Processed size: {} bytes", processed.len()); + + // Write uncompressed output + output + .write_all(&processed) + .change_context(TrustedServerError::Proxy { + message: "Failed to write output".to_string(), + })?; + + Ok(()) + } + /// Generic processing through compression layers fn process_through_compression( &mut self, diff --git a/docs/RSC_HYDRATION_FINDINGS.md b/docs/RSC_HYDRATION_FINDINGS.md new file mode 100644 index 0000000..1f7c8c8 --- /dev/null +++ b/docs/RSC_HYDRATION_FINDINGS.md @@ -0,0 +1,597 @@ +# RSC Hydration URL Rewriting: Technical Findings + +## Problem Statement + +When proxying Next.js App Router sites, URL rewriting in RSC (React Server Components) payloads caused React hydration to fail. The symptom was 0 React fiber nodes after page load, indicating complete hydration failure. + +## Background: How Next.js Delivers RSC Data + +Next.js App Router uses React Server Components with a streaming "flight" protocol. RSC data is delivered to the browser via inline ` + + +``` + +The `[1, "..."]` calls contain the actual RSC payload as a JavaScript string. 
+ +For client-side navigations, Next.js fetches Flight directly (no ` + + + +``` + +This happens because Next.js streams RSC data as it becomes available. The T-chunk header in script 10 declares 928 bytes (0x928 = 2344 decimal), but those bytes are delivered in script 11. + +### Real-World Example + +Analysis of a Next.js App Router site revealed the following cross-script pattern: + +``` +Script 59 (index 58): +- T-chunk header at position 1370: "436:T68f," +- Declares 0x68f = 1679 bytes of content +- Content starts but script ends before all bytes are delivered + +Script 60 (index 59): +- Contains continuation of T-chunk content +- Includes 5 URLs pointing to the origin host that need rewriting +- URLs at byte positions within the T-chunk span +``` + +When the Rust implementation processed each script independently: + +- Script 59: T-chunk header found, but `content_end = header_end` (0 bytes in THIS script) +- Script 60: Content processed, but no T-chunk header to update + +Result: T-chunk length remained at 0x68f while actual content changed size after URL rewriting. + +## Discovery 2: Combining Push Calls Breaks Hydration + +```javascript +// Original: 221 push calls -> 683 fibers (works) +// Combined into 1 push call: 0 fibers (broken!) +``` + +Even with identical content, consolidating all RSC into a single push call broke hydration. Next.js processes each push call incrementally, and the structure matters. + +## Discovery 3: Per-Script Streaming Processing Cannot Fix Cross-Script T-Chunks + +The streaming HTML processor (`lol_html`) processes scripts one at a time: + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ HTML Stream │ +│ │ +│ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ Process A Process B Process C │ +│ (isolated) (isolated) (isolated) │ +│ │ +│ Cannot share state between script processing! 
│ +└─────────────────────────────────────────────────────────────────┘ +``` + +This is a fundamental limitation: when script A declares a T-chunk that continues in script B, the streaming processor cannot: + +1. Track that script A's T-chunk is incomplete +2. Update script A's header after processing script B's URLs + +--- + +## The Solution: Two-Phase Processing + +### Phase 1: Streaming HTML Processing + +The HTML rewriter runs in a streaming pipeline (decompress → rewrite → recompress). During this phase we: + +- Rewrite standard HTML attributes (`href`, `src`, `srcset`, etc.) +- Run integration script rewriters for self-contained payloads (e.g., Pages Router `__NEXT_DATA__`) +- Leave `self.__next_f.push([1,"..."])` scripts untouched because T-chunks can span script boundaries + +### Phase 2: HTML Post-Processing (cross-script RSC) + +At end-of-document, the Next.js integration rewrites cross-script T-chunks **without a second HTML parse**: + +1. During the initial `lol_html` pass, `NextJsRscPlaceholderRewriter` replaces each `__next_f.push([1,"..."])` payload string with a placeholder token and records the original payloads in `IntegrationDocumentState`. +2. `NextJsHtmlPostProcessor` rewrites the recorded payload strings using the marker-based cross-script algorithm (combine → rewrite → split). +3. `NextJsHtmlPostProcessor` substitutes the placeholders in the final HTML with the rewritten payload strings. + +This phase is gated by `IntegrationHtmlPostProcessor::should_process` checking whether any RSC payloads were recorded, so non‑Next.js pages do not pay the post-processing cost ([html_post_process.rs:41](crates/common/src/integrations/nextjs/html_post_process.rs#L41)). + +### Marker-Based Cross-Script Processing + +#### Step 1: Combine Scripts with Markers + +Concatenate all RSC push payload strings using a marker delimiter that cannot appear in valid JSON/RSC content. 
+ +The marker `\x00SPLIT\x00` is chosen because: + +- Contains null byte (`\x00`) which cannot appear in valid JSON/RSC content +- Easily identifiable for splitting +- Won't be confused with any escape sequence + +**Implementation:** Marker constant at [rsc.rs:11](crates/common/src/integrations/nextjs/rsc.rs#L11) and combine/split logic in [rsc.rs:324](crates/common/src/integrations/nextjs/rsc.rs#L324) + +#### Step 2: Find T-Chunks Across Combined Content + +Scan the combined stream for `ID:T,` headers, then consume exactly `hex_length` unescaped bytes to find the T-chunk boundary. + +The key insight: markers don't count toward byte consumption. When a T-chunk declares 1679 bytes, we consume 1679 bytes of actual content, skipping over any markers we encounter. + +**Implementation:** T-chunk discovery at [rsc.rs:202](crates/common/src/integrations/nextjs/rsc.rs#L202) with marker-aware escape sequence iterator at [rsc.rs:72](crates/common/src/integrations/nextjs/rsc.rs#L72) + +#### Step 3: Rewrite URLs and Recalculate Lengths + +For each `T` chunk: + +1. Rewrite URLs in the chunk content (preserving marker bytes) +2. Recalculate the unescaped byte length (excluding markers) +3. Rewrite the header to `ID:T,` + +#### Step 4: Split Back on Markers + +Split the rewritten combined content by the marker to recover per-script payload strings. + +Each resulting payload corresponds to one original script, but with: + +- URLs rewritten +- T-chunk lengths correctly recalculated across script boundaries + +--- + +## Integration Hook Architecture + +The post-processing is implemented as an integration hook, allowing other integrations to also perform HTML post-processing. 
+ +### Trait Definition + +**Implementation:** Per-document state at [registry.rs:99](crates/common/src/integrations/registry.rs#L99), context at [registry.rs:331](crates/common/src/integrations/registry.rs#L331), and trait at [registry.rs:341](crates/common/src/integrations/registry.rs#L341) + +**Note:** `IntegrationHtmlPostProcessor::should_process` defaults to `false`, so integrations must explicitly opt in to post-processing via a cheap preflight check. + +### Registration + +**Implementation:** Next.js registers its placeholder rewriter + HTML post-processor when enabled in [mod.rs:86](crates/common/src/integrations/nextjs/mod.rs#L86) + +### Execution in HTML Processor + +**Implementation:** End-of-document post-processing wrapper at [html_processor.rs:20](crates/common/src/html_processor.rs#L20) + +--- + +## Byte Length Calculation Algorithm + +`T`-chunk lengths use the **unescaped** byte count of the payload (after decoding JavaScript string escapes). Correct handling requires: + +- Shared escape sequence iterator handles `\\n`, `\\xHH`, `\\uHHHH`, and surrogate pairs: [rsc.rs:37](crates/common/src/integrations/nextjs/rsc.rs#L37) +- Counting unescaped bytes: [rsc.rs:166](crates/common/src/integrations/nextjs/rsc.rs#L166) +- Consuming exactly _N unescaped bytes_ to locate the end of a declared `T` chunk: [rsc.rs:171](crates/common/src/integrations/nextjs/rsc.rs#L171) +- Marker-aware byte length calculation for cross-script processing: [rsc.rs:327](crates/common/src/integrations/nextjs/rsc.rs#L327) +- Size-limited combined payload allocation (default 10 MB, configurable via `integrations.nextjs.max_combined_payload_bytes`): [rsc.rs:378](crates/common/src/integrations/nextjs/rsc.rs#L378) +- Fail-safe: if `T`-chunk parsing fails (unreasonable length or truncated content), Trusted Server skips rewriting to avoid breaking hydration: [rsc.rs:202](crates/common/src/integrations/nextjs/rsc.rs#L202) +- If the size limit is exceeded and cross-script T-chunks are 
present, Trusted Server skips rewriting rather than risk breaking hydration: [rsc.rs:410](crates/common/src/integrations/nextjs/rsc.rs#L410) + +--- + +## URL Rewriting Patterns + +The solution handles multiple URL formats in RSC content: + +| Pattern | Example | In RSC String | +| -------------------- | ------------------- | ------------------- | +| Full HTTPS | `https://host/path` | `https://host/path` | +| Full HTTP | `http://host/path` | `http://host/path` | +| Protocol-relative | `//host/path` | `//host/path` | +| JSON-escaped slashes | `//host/path` | `\\/\\/host/path` | +| Double-escaped | `\\/\\/host` | `\\\\/\\\\/host` | +| Quad-escaped | `\\\\/\\\\/host` | `\\\\\\\\//host` | + +### Regex Pattern + +**Implementation:** Regex-based rewriting in [shared.rs:62](crates/common/src/integrations/nextjs/shared.rs#L62) + +This pattern handles: + +- Optional scheme (`https?`)? +- Optional colon (`:`)? +- Multiple escape levels for slashes +- The escaped origin hostname + +--- + +## Complete Processing Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ HTML Response from Origin │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ PHASE 1: HTML Rewrite (lol_html) │ +│ │ +│ - Rewrite HTML attributes (href/src/etc.) 
│ +│ - Rewrite Pages Router data (`__NEXT_DATA__`) │ +│ - For App Router RSC push scripts (`__next_f.push([1,\"...\"])`): │ +│ * Replace payload string with placeholder token │ +│ * Record original payloads (IntegrationDocumentState) │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ PHASE 2: HTML Post-Processing │ +│ (Integration Hook: NextJsHtmlPostProcessor) │ +│ │ +│ - Rewrite recorded payloads (marker-based cross-script T-chunk logic) │ +│ - Substitute placeholders with rewritten payload strings │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Final HTML Response to Client │ +│ │ +│ - All URLs rewritten to proxy host │ +│ - All T-chunk lengths correctly reflect content after URL rewriting │ +│ - Script structure preserved (same number of push calls) │ +│ - React hydration succeeds │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Test Results + +| Test Case | Result | +| --------------------------------------------------------- | ------ | +| T-chunk length shrinks (longer origin → shorter proxy) | Pass | +| T-chunk length grows (shorter origin → longer proxy) | Pass | +| Multiple T-chunks in same content | Pass | +| Escape sequences: `\n`, `\r`, `\t`, `\\`, `\"` | Pass | +| Unicode escapes: `\uHHHH` | Pass | +| Surrogate pairs: `\uD800\uDC00` | Pass | +| Hex escapes: `\xHH` | Pass | +| Various URL patterns (escaped slashes, etc.) 
| Pass | +| Cross-script T-chunk (header in script N, content in N+1) | Pass | +| Cross-script with multiple URLs in continuation | Pass | +| Non-T-chunk content preserved | Pass | +| HTML structure preserved after post-processing | Pass | + +### Comparison: JS v7 vs JS v8 vs Rust + +| Implementation | Approach | Fiber Count | Result | +| -------------- | ----------------------------- | ----------- | ------ | +| JS v7 | Per-script T-chunk rewriting | 0 | FAIL | +| JS v8 | Marker-based cross-script | 683 | PASS | +| Rust (final) | Two-phase with post-processor | 683 | PASS | + +### Playwright Browser Testing (December 2024) + +Automated testing with Playwright across Chrome and Firefox verified the implementation: + +**Test Setup:** + +- Fetched live HTML from a Next.js App Router site +- Applied RSC URL rewriting via the Rust post-processor +- Served rewritten HTML locally to isolate from bot detection + +**Results (both Chrome and Firefox):** + +| Metric | Value | +| ---------------------------------- | ------- | +| Hydration errors detected | 0 | +| Console errors (hydration-related) | 0 | +| Total links in page | 120 | +| Links rewritten to proxy | 120 | +| Links still pointing to origin | 0 | +| RSC push scripts present | Yes | +| `self.__next_f` entries | 223 | +| `__next` root element | Present | + +**Key Observations:** + +1. **No hydration mismatch**: React successfully hydrated without any "Text content does not match" or "Hydration failed" errors +2. **Complete URL rewriting**: All 120 navigation links correctly point to the proxy host +3. **RSC data preserved**: All 223 RSC Flight entries present in `self.__next_f` array +4. **Cross-browser compatibility**: Identical behavior in Chrome (Chromium) and Firefox + +--- + +## Compression Pipeline with Post-Processing + +Post-processing requires access to uncompressed UTF‑8 HTML, but the trusted server still preserves the origin `Content-Encoding` on the wire. + +End-to-end flow: + +1. 
`StreamingPipeline` decompresses the origin body based on `Content-Encoding` +2. The HTML processor runs `lol_html` rewriting and (optionally) integration post-processors on the complete HTML +3. `StreamingPipeline` recompresses to the original encoding + +Because post-processing runs inside the HTML processor (before recompression), `publisher.rs` does not need to special-case compression for integrations. + +**Implementation:** Post-processing entry point at [html_processor.rs:20](crates/common/src/html_processor.rs#L20) + +--- + +## Implementation Files + +| File | Purpose | +| ------------------------------------------------------------ | -------------------------------------------------------- | +| `crates/common/src/integrations/nextjs/mod.rs` | Next.js integration config + registration | +| `crates/common/src/integrations/nextjs/html_post_process.rs` | HTML post-processing for cross-script RSC | +| `crates/common/src/integrations/nextjs/rsc_placeholders.rs` | RSC placeholder insertion + payload capture (App Router) | +| `crates/common/src/integrations/nextjs/rsc.rs` | RSC T-chunk parsing + URL rewriting | +| `crates/common/src/integrations/nextjs/script_rewriter.rs` | Script rewrites (`__NEXT_DATA__`) | +| `crates/common/src/integrations/nextjs/shared.rs` | Shared regex patterns + payload parsing utilities | +| `crates/common/src/rsc_flight.rs` | Flight response rewriting (`text/x-component`) | +| `crates/common/src/integrations/registry.rs` | Integration traits + `IntegrationDocumentState` | +| `crates/common/src/integrations/mod.rs` | Module exports | +| `crates/common/src/html_processor.rs` | HTML rewriting + post-processor invocation | +| `crates/common/src/publisher.rs` | Response routing + streaming pipeline setup | +| `crates/common/src/streaming_processor.rs` | Compression transforms + `StreamProcessor` | + +### Key Functions (Next.js integration) + +| Symbol | Location | Purpose | +| ---------------------------------------------- | 
------------------------------------------------------------------------------------------- | ---------------------------------------------------------------- | +| `NextJsRscPlaceholderRewriter` | [rsc_placeholders.rs:52](crates/common/src/integrations/nextjs/rsc_placeholders.rs#L52) | Replace RSC payload strings with placeholders + record originals | +| `NextJsHtmlPostProcessor::post_process` | [html_post_process.rs:52](crates/common/src/integrations/nextjs/html_post_process.rs#L52) | Rewrite recorded payloads + substitute placeholders | +| `substitute_rsc_payload_placeholders` | [html_post_process.rs:116](crates/common/src/integrations/nextjs/html_post_process.rs#L116) | Substitute placeholder tokens in HTML | +| `IntegrationDocumentState` | [registry.rs:99](crates/common/src/integrations/registry.rs#L99) | Per-document state shared across phases | +| `EscapeSequenceIter` | [rsc.rs:37](crates/common/src/integrations/nextjs/rsc.rs#L37) | Shared iterator for escape sequence parsing | +| `TChunkInfo` | [rsc.rs:190](crates/common/src/integrations/nextjs/rsc.rs#L190) | T-chunk position info (stores `id_end` position, not String) | +| `calculate_unescaped_byte_length` | [rsc.rs:166](crates/common/src/integrations/nextjs/rsc.rs#L166) | Count unescaped bytes with escape handling | +| `consume_unescaped_bytes` | [rsc.rs:171](crates/common/src/integrations/nextjs/rsc.rs#L171) | Advance through string consuming N bytes | +| `find_tchunks` | [rsc.rs:260](crates/common/src/integrations/nextjs/rsc.rs#L260) | Find T-chunks in a single payload | +| `RscUrlRewriter` | [shared.rs:62](crates/common/src/integrations/nextjs/shared.rs#L62) | Regex URL rewriting for RSC payloads | +| `UrlRewriter` (script) | [script_rewriter.rs:91](crates/common/src/integrations/nextjs/script_rewriter.rs#L91) | Attribute-specific URL rewriting for `__NEXT_DATA__` (combined regex) | +| `rewrite_rsc_tchunks_with_rewriter` | [rsc.rs:272](crates/common/src/integrations/nextjs/rsc.rs#L272) | Single-payload 
T-chunk processing | +| `calculate_unescaped_byte_length_skip_markers` | [rsc.rs:314](crates/common/src/integrations/nextjs/rsc.rs#L314) | Count unescaped bytes, excluding markers | +| `find_tchunks_with_markers` | [rsc.rs:264](crates/common/src/integrations/nextjs/rsc.rs#L264) | Find T-chunks in marker-combined content | +| `rewrite_rsc_scripts_combined` | [rsc.rs:321](crates/common/src/integrations/nextjs/rsc.rs#L321) | Cross-script T-chunk processing | +| `find_rsc_push_scripts` | [html_post_process.rs:171](crates/common/src/integrations/nextjs/html_post_process.rs#L171) | (Deprecated) Find RSC scripts in HTML | +| `post_process_rsc_html_in_place` | [html_post_process.rs:287](crates/common/src/integrations/nextjs/html_post_process.rs#L287) | (Deprecated) Full HTML scan + rewrite | + +--- + +## Limitations + +### Very Long Proxy URLs + +If the proxy URL is significantly longer than the original, T-chunk content grows substantially. This is handled correctly (the hex length is recalculated), but it may affect: + +- Response size +- Streaming behavior if scripts become much larger + +### Performance Considerations + +The post-processing phase requires: + +1. Placeholder insertion during the initial `lol_html` pass (payload capture) +2. Combining payloads (memory allocation) +3. Regex matching for T-chunks +4. One pass placeholder substitution over the final HTML string + +For typical pages with 100-300 RSC scripts, this adds ~1-5ms to processing time. + +### Edge Cases Not Handled + +- Malformed RSC content (missing closing quotes, invalid hex) +- Nested script tags (shouldn't occur in valid HTML) +- Non-UTF8 encoded pages (requires UTF-8) + +--- + +## Deconstruction and Reconstruction Logic + +The RSC rewriting process involves carefully deconstructing RSC payloads, rewriting URLs, and reconstructing them with correct T-chunk lengths. 
The main runtime entry point is `NextJsHtmlPostProcessor::post_process()` at [html_post_process.rs:52](crates/common/src/integrations/nextjs/html_post_process.rs#L52), operating on payloads captured during phase 1 by `NextJsRscPlaceholderRewriter` ([rsc_placeholders.rs:52](crates/common/src/integrations/nextjs/rsc_placeholders.rs#L52)). + +### Step 1: Capture RSC Payloads (placeholders) + +During the initial HTML rewrite pass, replace each `self.__next_f.push([1, "..."])` payload string with a placeholder token and record the original payload strings in `IntegrationDocumentState`. + +**Implementation:** `NextJsRscPlaceholderRewriter::rewrite()` at [rsc_placeholders.rs:71](crates/common/src/integrations/nextjs/rsc_placeholders.rs#L71) and `IntegrationDocumentState` at [registry.rs:99](crates/common/src/integrations/registry.rs#L99) + +### Step 2: Combine Payloads with Markers + +Join all payloads with a marker string (`\x00SPLIT\x00`) that cannot appear in valid JSON/RSC content. This allows T-chunks to be processed across script boundaries while preserving the ability to split back later. + +**Implementation:** Marker constant at [rsc.rs:11](crates/common/src/integrations/nextjs/rsc.rs#L11), combining logic in `rewrite_rsc_scripts_combined()` at [rsc.rs:324](crates/common/src/integrations/nextjs/rsc.rs#L324) + +### Step 3: Find T-Chunks Across Combined Content + +Parse T-chunk headers (`ID:T,`) and consume exactly the declared number of unescaped bytes, skipping over markers. 
+ +**Implementation:** `find_tchunks_with_markers()` at [rsc.rs:267](crates/common/src/integrations/nextjs/rsc.rs#L267), using `EscapeSequenceIter::from_position_with_marker()` at [rsc.rs:72](crates/common/src/integrations/nextjs/rsc.rs#L72) + +### Step 4: Rewrite URLs in T-Chunk Content + +Rewrite all URL patterns in the T-chunk content: + +- `https://origin.example.com/path` → `http://proxy.example.com/path` +- `//origin.example.com/path` → `//proxy.example.com/path` +- `\\/\\/origin.example.com` → `\\/\\/proxy.example.com` (JSON-escaped) +- `\\\\//origin.example.com` → `\\\\//proxy.example.com` (double-escaped) + +**Implementation:** `RscUrlRewriter::rewrite()` at [shared.rs:91](crates/common/src/integrations/nextjs/shared.rs#L91) + +### Step 5: Recalculate T-Chunk Length + +Calculate the new unescaped byte length (excluding markers) and update the T-chunk header with the new hex length. + +**Implementation:** `calculate_unescaped_byte_length_skip_markers()` at [rsc.rs:317](crates/common/src/integrations/nextjs/rsc.rs#L317) + +### Step 6: Split Back on Markers + +Split the combined rewritten content back into individual payloads on the marker boundaries. Each payload corresponds to one original script, with T-chunk lengths now correct across script boundaries. + +**Implementation:** Part of `rewrite_rsc_scripts_combined()` at [rsc.rs:324](crates/common/src/integrations/nextjs/rsc.rs#L324) + +### Step 7: Reconstruct HTML + +Substitute placeholder tokens in the final HTML with the rewritten payload strings (no HTML re-parse). 
+ +**Implementation:** `substitute_rsc_payload_placeholders()` at [html_post_process.rs:116](crates/common/src/integrations/nextjs/html_post_process.rs#L116) + +### Visual Example + +``` +BEFORE (2 scripts, T-chunk spans both): +┌──────────────────────────────────────────────────────────────────┐ +│ Script 1: self.__next_f.push([1,"11:null\n1a:T68f,"]) │ +│ └─ T-chunk header: 1a:T68f (1679 bytes declared) │ +├──────────────────────────────────────────────────────────────────┤ +│ Script 2: self.__next_f.push([1,"{\"url\":\"https://origin...."])│ +│ └─ T-chunk content continues here (1679 bytes total) │ +└──────────────────────────────────────────────────────────────────┘ + +COMBINED (with marker): +"11:null\n1a:T68f,\x00SPLIT\x00{\"url\":\"https://origin.example.com/...\"}" + ^^^^^^^^^^ marker (not counted in byte length) + +AFTER URL REWRITE: +"11:null\n1a:T652,\x00SPLIT\x00{\"url\":\"http://proxy.example.com/...\"}" + ^^^ new hex length (shorter URL = smaller length) + +SPLIT BACK: +┌──────────────────────────────────────────────────────────────────┐ +│ Script 1: self.__next_f.push([1,"11:null\n1a:T652,"]) │ +│ └─ Updated T-chunk header with correct length │ +├──────────────────────────────────────────────────────────────────┤ +│ Script 2: self.__next_f.push([1,"{\"url\":\"http://proxy.exa..."])│ +│ └─ Rewritten URLs in content │ +└──────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Comparison: Old vs New Approach + +| Aspect | Old (Whitespace Padding) | New (T-Chunk Length Recalculation) | +| --------------------- | ---------------------------- | ------------------------------------------ | +| T-chunk handling | Broken - lengths not updated | Correct - lengths recalculated | +| URL length change | Limited to shorter URLs | Any length change supported | +| Escape sequences | Not properly counted | Fully supported | +| Cross-script T-chunks | Not handled | Handled via post-processing | +| Implementation | Simple regex replace | 
Full T-chunk parsing + post-processing |
+| Architecture | Hardcoded in processor | Integration hook pattern |
+| Extensibility | None | Other integrations can add post-processors |
+
+---
+
+## Conclusion
+
+RSC hydration requires **correct T-chunk byte lengths**. The trusted server solves this with two stages:
+
+### Stage 1: Streaming HTML rewrite
+
+- Run `lol_html` rewriting (attributes + integration script rewriters)
+- Skip `__next_f.push` payload scripts (handled in stage 2)
+
+### Stage 2: End-of-document post-processing (cross-script)
+
+- After streaming completes for the full HTML document
+- Combine scripts with markers
+- Recalculate T-chunk lengths across boundaries
+- Rewrite URLs in RSC payloads safely across script boundaries
+
+The key insights are:
+
+1. **T-chunk lengths must match content**: The RSC parser uses declared lengths to navigate
+2. **T-chunks can span scripts**: Next.js streaming splits content arbitrarily
+3. **Markers enable cross-script processing**: Combine, process, split back
+4. **Integration hooks enable extensibility**: Other integrations can add post-processors
+
+---
+
+## References
+
+- React Flight Protocol: Internal React implementation for RSC streaming: https://github.com/vercel/next.js/tree/v14.2.35
+- Next.js App Router: https://nextjs.org/docs/app
+- lol_html: https://github.com/cloudflare/lol-html (streaming HTML rewriter)
+- Implementation: `crates/common/src/integrations/nextjs/mod.rs` and `crates/common/src/integrations/nextjs/` diff --git a/docs/integration_guide.md b/docs/integration_guide.md index f90b7e8..f615827 100644 --- a/docs/integration_guide.md +++ b/docs/integration_guide.md @@ -133,19 +133,19 @@ impl IntegrationProxy for MyIntegration { } ``` -**Recommended:** Use the provided helper methods to automatically namespace your routes under -`/integrations/{integration_name()}/`. Available helpers: `get()`, `post()`, `put()`, `delete()`, -and `patch()`. 
This lets you define routes with just their relative paths (e.g., `self.post("/auction")` +**Recommended:** Use the provided helper methods to automatically namespace your routes under +`/integrations/{integration_name()}/`. Available helpers: `get()`, `post()`, `put()`, `delete()`, +and `patch()`. This lets you define routes with just their relative paths (e.g., `self.post("/auction")` becomes `"/integrations/my_integration/auction"`). You can also define routes manually using -`IntegrationEndpoint::get()` / `IntegrationEndpoint::post()` / etc. for backwards compatibility or +`IntegrationEndpoint::get()` / `IntegrationEndpoint::post()` / etc. for backwards compatibility or special cases. -Routes are matched verbatim in `crates/fastly/src/main.rs`, so stick to stable paths and +Routes are matched verbatim in `crates/fastly/src/main.rs`, so stick to stable paths and register whichever HTTP methods you need. **New integrations should namespace their routes under -`/integrations/{INTEGRATION_NAME}/`** using the helper methods (`self.get()`, `self.post()`, -`self.put()`, `self.delete()`, `self.patch()`) for consistency, but you can define routes manually +`/integrations/{INTEGRATION_NAME}/`** using the helper methods (`self.get()`, `self.post()`, +`self.put()`, `self.delete()`, `self.patch()`) for consistency, but you can define routes manually if needed (e.g., for backwards compatibility). -The shared context already injects Trusted Server logging, headers, +The shared context already injects Trusted Server logging, headers, and error handling; the handler only needs to deserialize the request, call the upstream endpoint, and stamp integration-specific headers. @@ -295,9 +295,9 @@ time. 
Two built-in integrations demonstrate how the framework pieces fit together: -| Integration | Purpose | Key files | -| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | -| `testlight` | Sample partner stub showing request proxying, attribute rewrites, and asset injection. | `crates/common/src/integrations/testlight.rs`, `crates/js/lib/src/integrations/testlight.ts` | +| Integration | Purpose | Key files | +| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | +| `testlight` | Sample partner stub showing request proxying, attribute rewrites, and asset injection. | `crates/common/src/integrations/testlight.rs`, `crates/js/lib/src/integrations/testlight.ts` | | `prebid` | Production Prebid Server bridge that owns `/first-party/ad` & `/third-party/ad`, injects synthetic IDs, rewrites creatives/notification URLs, and removes publisher-supplied Prebid scripts because the shim already ships in the unified TSJS build. 
| `crates/common/src/integrations/prebid.rs`, `crates/js/lib/src/ext/prebidjs.ts` | ### Example: Prebid integration diff --git a/trusted-server.toml b/trusted-server.toml index 6da5e63..1eee922 100644 --- a/trusted-server.toml +++ b/trusted-server.toml @@ -44,7 +44,9 @@ debug = false [integrations.nextjs] enabled = false -rewrite_attributes = ["href", "link", "url"] +rewrite_attributes = ["href", "link", "siteBaseUrl", "siteProductionDomain", "url"] +# Maximum combined payload size for cross-script RSC processing (bytes). Default is 10 MB. +max_combined_payload_bytes = 10485760 [integrations.testlight] endpoint = "https://testlight.example/openrtb2/auction"