Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 1 addition & 11 deletions crates/trusted-server-core/src/html_processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,6 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso
}),
];

let has_script_rewriters = !script_rewriters.is_empty();
for script_rewriter in script_rewriters {
let selector = script_rewriter.selector();
let rewriter = script_rewriter.clone();
Expand Down Expand Up @@ -493,16 +492,7 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso
..RewriterSettings::default()
};

// Use buffered mode when script rewriters are registered. lol_html fragments
// text nodes across input chunk boundaries, breaking rewriters that expect
// complete text (e.g., __NEXT_DATA__, GTM). Buffered mode feeds the entire
// document in one write() call, preserving text node integrity.
// Phase 3 will make rewriters fragment-safe, enabling streaming for all configs.
let inner = if has_script_rewriters {
HtmlRewriterAdapter::new_buffered(rewriter_settings)
} else {
HtmlRewriterAdapter::new(rewriter_settings)
};
let inner = HtmlRewriterAdapter::new(rewriter_settings);

HtmlWithPostProcessing {
inner,
Expand Down
195 changes: 190 additions & 5 deletions crates/trusted-server-core/src/integrations/google_tag_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
//! | `GET/POST` | `.../collect` | Proxies GA analytics beacons |
//! | `GET/POST` | `.../g/collect` | Proxies GA4 analytics beacons |

use std::sync::Arc;
use std::sync::{Arc, Mutex};

use async_trait::async_trait;
use error_stack::{Report, ResultExt};
Expand Down Expand Up @@ -133,11 +133,17 @@ fn validate_container_id(container_id: &str) -> Result<(), validator::Validation

pub struct GoogleTagManagerIntegration {
config: GoogleTagManagerConfig,
/// Accumulates text fragments when `lol_html` splits a text node across
/// chunk boundaries. Drained on `is_last_in_text_node`.
accumulated_text: Mutex<String>,
}

impl GoogleTagManagerIntegration {
fn new(config: GoogleTagManagerConfig) -> Arc<Self> {
Arc::new(Self { config })
Arc::new(Self {
config,
accumulated_text: Mutex::new(String::new()),
})
}

fn error(message: impl Into<String>) -> TrustedServerError {
Expand Down Expand Up @@ -481,14 +487,40 @@ impl IntegrationScriptRewriter for GoogleTagManagerIntegration {
"script" // Match all scripts to find inline GTM snippets
}

fn rewrite(&self, content: &str, _ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction {
fn rewrite(&self, content: &str, ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction {
let mut buf = self
.accumulated_text
.lock()
.unwrap_or_else(std::sync::PoisonError::into_inner);

if !ctx.is_last_in_text_node {
// Intermediate fragment — accumulate and suppress output.
buf.push_str(content);
return ScriptRewriteAction::RemoveNode;
}

// Last fragment. Determine the full content to inspect.
let full_content;
let text = if buf.is_empty() {
content
} else {
buf.push_str(content);
full_content = std::mem::take(&mut *buf);
&full_content
};

// Look for the GTM snippet pattern.
// Standard snippet contains: "googletagmanager.com/gtm.js"
// Note: analytics.google.com is intentionally excluded — gtag.js stores
// that domain as a bare string and constructs URLs dynamically, so
// rewriting it in scripts produces broken URLs.
if content.contains("googletagmanager.com") || content.contains("google-analytics.com") {
return ScriptRewriteAction::replace(Self::rewrite_gtm_urls(content));
if text.contains("googletagmanager.com") || text.contains("google-analytics.com") {
return ScriptRewriteAction::replace(Self::rewrite_gtm_urls(text));
}

// No GTM content — if we accumulated fragments, emit them unchanged.
if text.len() != content.len() {
return ScriptRewriteAction::replace(text.to_string());
}

ScriptRewriteAction::keep()
Expand Down Expand Up @@ -1631,4 +1663,157 @@ container_id = "GTM-DEFAULT"
other => panic!("Expected Integration error, got {:?}", other),
}
}

#[test]
fn fragmented_gtm_snippet_is_accumulated_and_rewritten() {
let config = GoogleTagManagerConfig {
enabled: true,
container_id: "GTM-FRAG1".to_string(),
upstream_url: "https://www.googletagmanager.com".to_string(),
cache_max_age: default_cache_max_age(),
max_beacon_body_size: default_max_beacon_body_size(),
};
let integration = GoogleTagManagerIntegration::new(config);

let document_state = IntegrationDocumentState::default();

// Simulate lol_html splitting the GTM snippet mid-domain.
let fragment1 = r#"(function(w,d,s,l,i){j.src='https://www.google"#;
let fragment2 = r#"tagmanager.com/gtm.js?id='+i;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-FRAG1');"#;

let ctx_intermediate = IntegrationScriptContext {
selector: "script",
request_host: "publisher.example.com",
request_scheme: "https",
origin_host: "origin.example.com",
is_last_in_text_node: false,
document_state: &document_state,
};
let ctx_last = IntegrationScriptContext {
is_last_in_text_node: true,
..ctx_intermediate
};

// Intermediate fragment: should be suppressed.
let action1 =
IntegrationScriptRewriter::rewrite(&*integration, fragment1, &ctx_intermediate);
assert_eq!(
action1,
ScriptRewriteAction::RemoveNode,
"should suppress intermediate fragment"
);

// Last fragment: should emit full rewritten content.
let action2 = IntegrationScriptRewriter::rewrite(&*integration, fragment2, &ctx_last);
match action2 {
ScriptRewriteAction::Replace(rewritten) => {
assert!(
rewritten.contains("/integrations/google_tag_manager/gtm.js"),
"should rewrite GTM URL. Got: {rewritten}"
);
assert!(
!rewritten.contains("googletagmanager.com"),
"should not contain original GTM domain. Got: {rewritten}"
);
}
other => panic!("expected Replace for fragmented GTM, got {other:?}"),
}
}

#[test]
fn non_gtm_fragmented_script_is_passed_through() {
let config = GoogleTagManagerConfig {
enabled: true,
container_id: "GTM-PASS1".to_string(),
upstream_url: "https://www.googletagmanager.com".to_string(),
cache_max_age: default_cache_max_age(),
max_beacon_body_size: default_max_beacon_body_size(),
};
let integration = GoogleTagManagerIntegration::new(config);

let document_state = IntegrationDocumentState::default();

let fragment1 = "console.log('hel";
let fragment2 = "lo world');";

let ctx_intermediate = IntegrationScriptContext {
selector: "script",
request_host: "publisher.example.com",
request_scheme: "https",
origin_host: "origin.example.com",
is_last_in_text_node: false,
document_state: &document_state,
};
let ctx_last = IntegrationScriptContext {
is_last_in_text_node: true,
..ctx_intermediate
};

let action1 =
IntegrationScriptRewriter::rewrite(&*integration, fragment1, &ctx_intermediate);
assert_eq!(
action1,
ScriptRewriteAction::RemoveNode,
"should suppress intermediate"
);

// Last fragment: should emit full unchanged content since it's not GTM.
let action2 = IntegrationScriptRewriter::rewrite(&*integration, fragment2, &ctx_last);
match action2 {
ScriptRewriteAction::Replace(content) => {
assert_eq!(
content, "console.log('hello world');",
"should emit full accumulated non-GTM content"
);
}
other => panic!("expected Replace with passthrough, got {other:?}"),
}
}

/// Regression test: with a small chunk size, `lol_html` fragments the
/// inline GTM script text node. The rewriter must accumulate fragments
/// and produce correct output through the full HTML pipeline.
#[test]
fn small_chunk_gtm_rewrite_survives_fragmentation() {
let mut settings = make_settings();
settings
.integrations
.insert_config(
"google_tag_manager",
&serde_json::json!({
"enabled": true,
"container_id": "GTM-SMALL1"
}),
)
.expect("should update config");

let registry = IntegrationRegistry::new(&settings).expect("should create registry");
let config = config_from_settings(&settings, &registry);
let processor = create_html_processor(config);

// Use a very small chunk size to force fragmentation mid-domain.
let pipeline_config = PipelineConfig {
input_compression: Compression::None,
output_compression: Compression::None,
chunk_size: 32,
};
let mut pipeline = StreamingPipeline::new(pipeline_config, processor);

let html_input = r#"<html><head><script>j.src='https://www.googletagmanager.com/gtm.js?id=GTM-SMALL1';</script></head><body></body></html>"#;

let mut output = Vec::new();
pipeline
.process(Cursor::new(html_input.as_bytes()), &mut output)
.expect("should process with small chunks");
let processed = String::from_utf8_lossy(&output);

assert!(
processed.contains("/integrations/google_tag_manager/gtm.js"),
"should rewrite fragmented GTM URL. Got: {processed}"
);
assert!(
!processed.contains("googletagmanager.com"),
"should not contain original GTM domain. Got: {processed}"
);
}
}
51 changes: 51 additions & 0 deletions crates/trusted-server-core/src/integrations/nextjs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -494,4 +494,55 @@ mod tests {
final_html
);
}

/// Regression test: with a small chunk size, `lol_html` fragments the
/// `__NEXT_DATA__` text node across chunks. The rewriter must accumulate
/// fragments and produce correct output.
#[test]
fn small_chunk_next_data_rewrite_survives_fragmentation() {
// Build a __NEXT_DATA__ payload large enough to cross a 32-byte chunk boundary.
let html = r#"<html><body><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"href":"https://origin.example.com/reviews","title":"Hello World"}}}</script></body></html>"#;

let mut settings = create_test_settings();
settings
.integrations
.insert_config(
"nextjs",
&json!({
"enabled": true,
"rewrite_attributes": ["href", "link", "url"],
}),
)
.expect("should update nextjs config");
let registry = IntegrationRegistry::new(&settings).expect("should create registry");
let config = config_from_settings(&settings, &registry);
let processor = create_html_processor(config);

// Use a very small chunk size to force fragmentation.
let pipeline_config = PipelineConfig {
input_compression: Compression::None,
output_compression: Compression::None,
chunk_size: 32,
};
let mut pipeline = StreamingPipeline::new(pipeline_config, processor);

let mut output = Vec::new();
pipeline
.process(Cursor::new(html.as_bytes()), &mut output)
.expect("should process with small chunks");

let processed = String::from_utf8_lossy(&output);
assert!(
processed.contains("test.example.com") && processed.contains("/reviews"),
"should rewrite fragmented __NEXT_DATA__ href. Got: {processed}"
);
assert!(
!processed.contains("origin.example.com/reviews"),
"should not contain original origin href. Got: {processed}"
);
assert!(
processed.contains("Hello World"),
"should preserve non-URL content. Got: {processed}"
);
}
}
Loading
Loading