From 6b08fc7fdffe18dadec7dad3536a2b3448a73054 Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Fri, 30 Jan 2026 13:52:24 -0700 Subject: [PATCH 1/6] Add autest for fail action 5 cache read retry mode add fixes around read_retry write fail action 5 --- src/proxy/http/HttpTransact.cc | 12 + .../cache/cache-read-retry-mode.test.py | 282 ++++++++++++++++++ .../replay/cache-read-retry-basic.replay.yaml | 180 +++++++++++ .../cache-read-retry-exhausted.replay.yaml | 235 +++++++++++++++ 4 files changed, 709 insertions(+) create mode 100644 tests/gold_tests/cache/cache-read-retry-mode.test.py create mode 100644 tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml create mode 100644 tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index 496699d1839..17d82c20e76 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -3270,6 +3270,11 @@ HttpTransact::HandleCacheOpenReadMiss(State *s) s->cache_info.action = CacheAction_t::NO_ACTION; } else if (s->api_server_response_no_store) { // plugin may have decided not to cache the response s->cache_info.action = CacheAction_t::NO_ACTION; + } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) { + // We've looped back around due to failing to read during READ_RETRY mode. + // Don't attempt another cache write - just proxy to origin without caching. + TxnDbg(dbg_ctl_http_trans, "READ_RETRY cache read failed, bypassing cache"); + s->cache_info.action = CacheAction_t::NO_ACTION; } else { HttpTransact::set_cache_prepare_write_action_for_new_request(s); } @@ -3342,6 +3347,13 @@ HttpTransact::set_cache_prepare_write_action_for_new_request(State *s) // don't have a state for that. ink_release_assert(s->redirect_info.redirect_in_process); s->cache_info.action = CacheAction_t::WRITE; + } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY && !s->redirect_info.redirect_in_process) { + // Defensive: Should not reach here if HandleCacheOpenReadMiss check is working. + // If we somehow get here in READ_RETRY state (outside of redirect), bypass cache. + // Note: For redirects, we allow normal handling since the redirect URL has a different + // cache key and there's no write lock conflict. + Warning("set_cache_prepare_write_action_for_new_request called with READ_RETRY state"); + s->cache_info.action = CacheAction_t::NO_ACTION; } else { s->cache_info.action = CacheAction_t::PREPARE_TO_WRITE; s->cache_info.write_lock_state = HttpTransact::CacheWriteLock_t::INIT; diff --git a/tests/gold_tests/cache/cache-read-retry-mode.test.py b/tests/gold_tests/cache/cache-read-retry-mode.test.py new file mode 100644 index 00000000000..ea4a4c2fbef --- /dev/null +++ b/tests/gold_tests/cache/cache-read-retry-mode.test.py @@ -0,0 +1,282 @@ +''' +Test cache_open_write_fail_action = 5 (READ_RETRY mode) +''' +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +Test.Summary = ''' +Test cache_open_write_fail_action = 5 (READ_RETRY mode) to verify: +1. Basic read-while-writer behavior with fail_action=5 +2. Multiple concurrent requests for same uncached URL trigger READ_RETRY +3. Stale content revalidation with concurrent requests triggers READ_RETRY +4. System does not crash under write lock contention +5. All requests are served correctly +''' + +Test.ContinueOnFail = True + +# Basic tests using Proxy Verifier replays (sequential, validate basic behavior) +Test.ATSReplayTest(replay_file="replay/cache-read-retry-basic.replay.yaml") +Test.ATSReplayTest(replay_file="replay/cache-read-retry-exhausted.replay.yaml") + + +class ReadRetryParallelTest: + """ + Test that READ_RETRY mode (fail_action=5) handles write lock contention correctly. + + Strategy: + 1. Fire multiple parallel requests for the same uncached URL + 2. Origin server responds slowly (2 seconds via httpbin /delay) + 3. First request gets write lock and starts writing to cache + 4. Subsequent requests fail to get write lock, enter READ_RETRY + 5. All requests should complete successfully (no crashes) + """ + + def __init__(self): + self.__setup_origin_server() + self.__setup_ts() + + def __setup_origin_server(self): + # Use httpbin which handles concurrent requests and has /delay endpoint + self.server = Test.MakeHttpBinServer("httpbin-parallel") + + def __setup_ts(self): + self.ts = Test.MakeATSProcess("ts-parallel", enable_cache=True) + + self.ts.Disk.records_config.update( + { + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'http|cache|http_cache|http_trans', + # Enable READ_RETRY mode + 'proxy.config.http.cache.open_write_fail_action': 5, + # Configure write retry parameters - low count to fall back to READ_RETRY quickly + 'proxy.config.http.cache.max_open_write_retries': 1, + 'proxy.config.http.cache.max_open_write_retry_timeout': 5, + # Configure read retry parameters + 'proxy.config.http.cache.max_open_read_retries': 10, + 'proxy.config.http.cache.open_read_retry_time': 100, + # Enable read-while-writer + 'proxy.config.cache.enable_read_while_writer': 1, + # Required for caching + 'proxy.config.http.cache.required_headers': 0, + }) + + self.ts.Disk.remap_config.AddLine(f'map http://example.com/ http://127.0.0.1:{self.server.Variables.Port}/') + + def test_parallel_requests(self): + """ + Fire multiple curl requests in parallel using shell backgrounding. + All requests should complete successfully. + """ + tr = Test.AddTestRun("Parallel requests for same uncached URL") + ps = tr.Processes.Default + + ps.StartBefore(self.server) + ps.StartBefore(self.ts) + + # Fire 5 parallel requests using shell backgrounding + # The /delay/2 endpoint makes origin respond after 2 seconds + # First request gets the write lock, others should enter READ_RETRY + port = self.ts.Variables.port + tr.MakeCurlCommandMulti( + f''' +# Fire all requests in parallel (backgrounded) to same slow URL +({{curl}} -s -o /dev/null -w "Request 1: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) & +({{curl}} -s -o /dev/null -w "Request 2: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) & +({{curl}} -s -o /dev/null -w "Request 3: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) & +({{curl}} -s -o /dev/null -w "Request 4: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) & +({{curl}} -s -o /dev/null -w "Request 5: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) & +# Wait for all background jobs to complete +wait +echo "All parallel requests completed" +''', + ts=self.ts) + + ps.ReturnCode = 0 + # All requests should return HTTP 200 + ps.Streams.stdout.Content = Testers.ContainsExpression("Request 1: HTTP 200", "First request should succeed") + ps.Streams.stdout.Content += Testers.ContainsExpression("Request 2: HTTP 200", "Second request should succeed") + ps.Streams.stdout.Content += Testers.ContainsExpression("Request 3: HTTP 200", "Third request should succeed") + ps.Streams.stdout.Content += Testers.ContainsExpression("Request 4: HTTP 200", "Fourth request should succeed") + ps.Streams.stdout.Content += Testers.ContainsExpression("Request 5: HTTP 200", "Fifth request should succeed") + ps.Streams.stdout.Content += Testers.ContainsExpression("All parallel requests completed", "All requests should complete") + + tr.StillRunningAfter = self.ts + + def test_verify_cache_populated(self): + """ + Verify the cache was populated after parallel requests. + Subsequent request should be fast (from cache). + """ + tr = Test.AddTestRun("Verify cache hit after parallel requests") + ps = tr.Processes.Default + + port = self.ts.Variables.port + # Request without delay - should be served from cache quickly + tr.MakeCurlCommandMulti( + f''' +{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/delay/2 +''', ts=self.ts) + + ps.ReturnCode = 0 + ps.Streams.stdout.Content = Testers.ContainsExpression("200 OK", "Request should succeed") + + tr.StillRunningAfter = self.ts + + def test_no_crash_indicators(self): + """ + Verify ATS logs don't contain crash indicators. + """ + tr = Test.AddTestRun("Verify no crash indicators in logs") + ps = tr.Processes.Default + + # Check traffic.out for crash indicators + ps.Command = f'grep -E "FATAL|ALERT|Emergency|ink_release_assert|ink_abort" {self.ts.Variables.LOGDIR}/traffic.out || echo "No crash indicators found"' + + ps.ReturnCode = 0 + ps.Streams.stdout.Content = Testers.ContainsExpression("No crash indicators found", "Should not find any crash indicators") + + tr.StillRunningAfter = self.ts + + def run(self): + self.test_parallel_requests() + self.test_verify_cache_populated() + self.test_no_crash_indicators() + + +class ReadRetryRevalidationTest: + """ + Test that READ_RETRY mode handles stale content revalidation with concurrent requests. + + Strategy: + 1. Populate cache with content that has a short TTL + 2. Wait for content to become stale + 3. Fire multiple parallel requests for the stale URL + 4. All requests see stale content and attempt to revalidate + 5. First request gets write lock, others enter READ_RETRY + 6. All requests should complete successfully + """ + + def __init__(self): + self.__setup_origin_server() + self.__setup_ts() + + def __setup_origin_server(self): + # Use httpbin for slow revalidation responses + self.server = Test.MakeHttpBinServer("httpbin-revalidate") + + def __setup_ts(self): + self.ts = Test.MakeATSProcess("ts-revalidate", enable_cache=True) + + self.ts.Disk.records_config.update( + { + 'proxy.config.diags.debug.enabled': 1, + 'proxy.config.diags.debug.tags': 'http|cache|http_cache|http_trans', + # Enable READ_RETRY mode + 'proxy.config.http.cache.open_write_fail_action': 5, + # Configure write retry parameters + 'proxy.config.http.cache.max_open_write_retries': 1, + 'proxy.config.http.cache.max_open_write_retry_timeout': 5, + # Configure read retry parameters + 'proxy.config.http.cache.max_open_read_retries': 10, + 'proxy.config.http.cache.open_read_retry_time': 100, + # Enable read-while-writer + 'proxy.config.cache.enable_read_while_writer': 1, + # Required for caching + 'proxy.config.http.cache.required_headers': 0, + }) + + self.ts.Disk.remap_config.AddLine(f'map http://example.com/ http://127.0.0.1:{self.server.Variables.Port}/') + + def test_populate_cache(self): + """ + Populate cache with content that has short TTL. + Using /cache/{seconds} which returns Cache-Control: max-age={seconds} + """ + tr = Test.AddTestRun("Populate cache with short-TTL content") + ps = tr.Processes.Default + + ps.StartBefore(self.server) + ps.StartBefore(self.ts) + + port = self.ts.Variables.port + # /cache/1 returns content with max-age=1 + tr.MakeCurlCommandMulti( + f''' +{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/cache/1 && echo "Cache populated with 1-second TTL content" +''', + ts=self.ts) + + ps.ReturnCode = 0 + ps.Streams.stdout.Content = Testers.ContainsExpression("200 OK", "Initial request should succeed") + ps.Streams.stdout.Content += Testers.ContainsExpression("Cache populated", "Cache population message") + + tr.StillRunningAfter = self.ts + + def test_parallel_revalidation(self): + """ + Wait for content to become stale, then send parallel revalidation requests. + """ + tr = Test.AddTestRun("Parallel revalidation requests for stale content") + ps = tr.Processes.Default + + port = self.ts.Variables.port + # Wait 2 seconds for content to become stale, then fire parallel requests + # Use /delay/2 for the revalidation to be slow (simulating slow origin) + tr.MakeCurlCommandMulti( + f''' +sleep 2 && ({{curl}} -s -o /dev/null -w "Revalidate 1: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) & +({{curl}} -s -o /dev/null -w "Revalidate 2: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) & +({{curl}} -s -o /dev/null -w "Revalidate 3: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) & +({{curl}} -s -o /dev/null -w "Revalidate 4: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) & +wait && echo "All revalidation requests completed" +''', + ts=self.ts) + + ps.ReturnCode = 0 + # All revalidation requests should succeed + ps.Streams.stdout.Content = Testers.ContainsExpression("Revalidate 1: HTTP 200", "First revalidation should succeed") + ps.Streams.stdout.Content += Testers.ContainsExpression("Revalidate 2: HTTP 200", "Second revalidation should succeed") + ps.Streams.stdout.Content += Testers.ContainsExpression("Revalidate 3: HTTP 200", "Third revalidation should succeed") + ps.Streams.stdout.Content += Testers.ContainsExpression("Revalidate 4: HTTP 200", "Fourth revalidation should succeed") + ps.Streams.stdout.Content += Testers.ContainsExpression( + "All revalidation requests completed", "All requests should complete") + + tr.StillRunningAfter = self.ts + + def test_no_crash_indicators(self): + """ + Verify ATS logs don't contain crash indicators. + """ + tr = Test.AddTestRun("Verify no crash indicators after revalidation") + ps = tr.Processes.Default + + ps.Command = f'grep -E "FATAL|ALERT|Emergency|ink_release_assert|ink_abort" {self.ts.Variables.LOGDIR}/traffic.out || echo "No crash indicators found"' + + ps.ReturnCode = 0 + ps.Streams.stdout.Content = Testers.ContainsExpression("No crash indicators found", "Should not find any crash indicators") + + tr.StillRunningAfter = self.ts + + def run(self): + self.test_populate_cache() + self.test_parallel_revalidation() + self.test_no_crash_indicators() + + +# Run the parallel tests that actually trigger READ_RETRY +ReadRetryParallelTest().run() +ReadRetryRevalidationTest().run() diff --git a/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml b/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml new file mode 100644 index 00000000000..dd2c34e3b1d --- /dev/null +++ b/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml @@ -0,0 +1,180 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# This replay file tests cache_open_write_fail_action = 5 (READ_RETRY) +# with a basic scenario where read-while-writer eventually succeeds. +# +# Scenario: +# - Two concurrent requests arrive for the same uncached URL +# - Request 1 gets write lock, connects to origin (slow, 3 second response) +# - Request 2 fails write lock, enters READ_RETRY mode +# - Request 2 retries reads, eventually gets read-while-writer access +# - Both requests should succeed with 200 responses +# - Only ONE origin connection should be made (request collapsing) +# + +meta: + version: "1.0" + +autest: + dns: + name: 'dns-read-retry-basic' + description: 'Test cache_open_write_fail_action = 5 (READ_RETRY) with successful read-while-writer' + + server: + name: 'origin-read-retry-basic' + + client: + name: 'client-read-retry-basic' + + ats: + name: 'ts-read-retry-basic' + process_config: + enable_cache: true + + records_config: + proxy.config.diags.debug.enabled: 1 + proxy.config.diags.debug.tags: 'http|cache|http_cache' + # Enable READ_RETRY mode + proxy.config.http.cache.open_write_fail_action: 5 + # Configure retry parameters + proxy.config.http.cache.max_open_write_retries: 1 + proxy.config.http.cache.max_open_write_retry_timeout: 0 + proxy.config.http.cache.max_open_read_retries: 20 + proxy.config.http.cache.open_read_retry_time: 100 + # Enable read-while-writer + proxy.config.cache.enable_read_while_writer: 1 + + remap_config: + - from: "http://example.com/" + to: "http://backend.example.com:{SERVER_HTTP_PORT}/" + + log_validation: + traffic_out: + # Should NOT contain crash indicators + excludes: + - expression: "FATAL|ALERT|Emergency|ink_release_assert" + description: "Verify ATS does not crash with READ_RETRY mode" + # Should contain cache operations (basic validation) + contains: + - expression: "state_cache_open_write|CACHE_EVENT_OPEN" + description: "Verify cache operations occur" + +sessions: + ############################################################################# + # First session: Request that gets the write lock (slow origin response) + ############################################################################# + - transactions: + - client-request: + method: "GET" + version: "1.1" + url: /bigfile + headers: + fields: + - [uuid, first-request-write-lock] + - [Host, example.com] + - [X-Request, first] + + proxy-request: + headers: + fields: + - [X-Request, {value: 'first', as: equal}] + + server-response: + # Slow response (3 seconds) to allow second request to retry reads + delay: 3s + status: 200 + reason: OK + headers: + fields: + - [Content-Length, 100] + - [Cache-Control, "max-age=300"] + - [X-Response, from-origin] + + proxy-response: + status: 200 + headers: + fields: + - [X-Response, {value: 'from-origin', as: equal}] + + ############################################################################# + # Second session: Concurrent request that should use READ_RETRY + # This runs in parallel with the first session + ############################################################################# + - transactions: + - client-request: + # Small delay to ensure first request gets write lock first + delay: 100ms + method: "GET" + version: "1.1" + url: /bigfile + headers: + fields: + - [uuid, second-request-read-retry] + - [Host, example.com] + - [X-Request, second] + + # Server should NOT receive this request (read-while-writer should work) + # If server receives it, request collapsing failed + server-response: + status: 200 + reason: OK + headers: + fields: + - [Content-Length, 100] + - [Cache-Control, "max-age=300"] + - [X-Response, duplicate-origin-connection] + + # Proxy should respond with 200, either from read-while-writer or cache + proxy-response: + status: 200 + headers: + fields: + # Should get the response, but preferably from read-while-writer + - [X-Response, {value: 'from-origin', as: equal}] + + ############################################################################# + # Third session: Verify content is cached after both requests complete + ############################################################################# + - transactions: + - client-request: + # Wait for previous transactions to complete + delay: 4s + method: "GET" + version: "1.1" + url: /bigfile + headers: + fields: + - [uuid, third-request-cache-hit] + - [Host, example.com] + - [X-Request, third] + + # Server should NOT receive this request (should be cache hit) + server-response: + status: 200 + reason: OK + headers: + fields: + - [Content-Length, 100] + - [X-Response, should-not-see-this] + + # Proxy should respond from cache + proxy-response: + status: 200 + headers: + fields: + - [X-Response, {value: 'from-origin', as: equal}] diff --git a/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml b/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml new file mode 100644 index 00000000000..108a5fd3e10 --- /dev/null +++ b/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml @@ -0,0 +1,235 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# This replay file tests cache_open_write_fail_action = 5 (READ_RETRY) +# with concurrent requests and slow origin responses. +# +# Scenario: +# - Multiple concurrent requests arrive for the same uncached URL +# - Request 1 gets write lock, connects to origin (slow, 3 second response) +# - Request 2-3 try to read, retry reads, eventually get read-while-writer access +# - All requests should succeed with 200 responses from read-while-writer +# - Only ONE origin connection should be made (request collapsing works) +# - System should NOT crash (validates stability with fail_action=5 enabled) +# +# Expected Behavior: +# 1. Request collapsing works correctly (all requests get first-origin response) +# 2. Read-while-writer functions properly +# 3. No crashes occur +# 4. Content is cached properly +# + +meta: + version: "1.0" + +autest: + dns: + name: 'dns-read-retry-exhausted' + description: 'Test cache_open_write_fail_action = 5 (READ_RETRY) with slow origin and concurrent requests to verify request collapsing and stability' + + server: + name: 'origin-read-retry-exhausted' + + client: + name: 'client-read-retry-exhausted' + + ats: + name: 'ts-read-retry-exhausted' + process_config: + enable_cache: true + + records_config: + proxy.config.diags.debug.enabled: 1 + proxy.config.diags.debug.tags: 'http|cache|http_cache|http_trans' + # Enable READ_RETRY mode + proxy.config.http.cache.open_write_fail_action: 5 + # Configure retry parameters to exhaust quickly + proxy.config.http.cache.max_open_write_retries: 1 + proxy.config.http.cache.max_open_write_retry_timeout: 0 + # Only 5 read retries @ 100ms = 500ms total (not enough for 10s origin) + proxy.config.http.cache.max_open_read_retries: 5 + proxy.config.http.cache.open_read_retry_time: 100 + # Enable read-while-writer + proxy.config.cache.enable_read_while_writer: 1 + + remap_config: + - from: "http://example.com/" + to: "http://backend.example.com:{SERVER_HTTP_PORT}/" + + log_validation: + traffic_out: + # Should NOT contain crash indicators + excludes: + - expression: "FATAL|ALERT|Emergency|ink_release_assert|ink_abort" + description: "Verify ATS does not crash with READ_RETRY mode enabled" + # Should contain cache read retries (shows the system is working) + contains: + - expression: "retrying cache open read|read while write" + description: "Verify cache retry mechanism is active" + +sessions: + ############################################################################# + # First session: Request that gets the write lock (VERY slow origin) + ############################################################################# + - transactions: + - client-request: + method: "GET" + version: "1.1" + url: /slowfile + headers: + fields: + - [uuid, first-request-slow-origin] + - [Host, example.com] + - [X-Request, first-slow] + + proxy-request: + headers: + fields: + - [X-Request, {value: 'first-slow', as: equal}] + + server-response: + # Slow response (3 seconds) to allow second request to retry reads + # Must be less than client timeout (5s) but long enough to trigger retries + delay: 3s + status: 200 + reason: OK + headers: + fields: + - [Content-Length, 200] + - [Cache-Control, "max-age=300"] + - [X-Response, first-origin] + - [X-Origin-Request, first] + + proxy-response: + status: 200 + headers: + fields: + - [X-Response, {value: 'first-origin', as: equal}] + + ############################################################################# + # Second session: Concurrent request that exhausts READ_RETRY attempts + # This runs in parallel and triggers the bug path + ############################################################################# + - transactions: + - client-request: + # Small delay to ensure first request gets write lock first + delay: 50ms + method: "GET" + version: "1.1" + url: /slowfile + headers: + fields: + - [uuid, second-request-exhausted] + - [Host, example.com] + - [X-Request, second-exhausted] + + proxy-request: + headers: + fields: + - [X-Request, {value: 'second-exhausted', as: equal}] + + # This request should NOT reach the server if read-while-writer works + # It should be served from Request 1's response + server-response: + status: 200 + reason: OK + headers: + fields: + - [Content-Length, 200] + - [Cache-Control, "max-age=300"] + - [X-Response, second-origin] + - [X-Origin-Request, second] + + # Proxy should respond with 200 from read-while-writer (first request's response) + proxy-response: + status: 200 + headers: + fields: + # Should get first-origin via read-while-writer (request collapsing works) + - [X-Response, {value: 'first-origin', as: equal}] + + ############################################################################# + # Third session: Another concurrent request to stress test the system + ############################################################################# + - transactions: + - client-request: + # Arrives after second request starts but before first completes + delay: 100ms + method: "GET" + version: "1.1" + url: /slowfile + headers: + fields: + - [uuid, third-request-concurrent] + - [Host, example.com] + - [X-Request, third-concurrent] + + proxy-request: + headers: + fields: + - [X-Request, {value: 'third-concurrent', as: equal}] + + # This request will also be served via read-while-writer + server-response: + status: 200 + reason: OK + headers: + fields: + - [Content-Length, 200] + - [Cache-Control, "max-age=300"] + - [X-Response, third-origin] + - [X-Origin-Request, third] + + # Should get a response from read-while-writer + proxy-response: + status: 200 + headers: + fields: + # Should get first-origin via read-while-writer + - [X-Response, {value: 'first-origin', as: equal}] + + ############################################################################# + # Fourth session: Verify cache state after all the chaos + ############################################################################# + - transactions: + - client-request: + # Wait for all previous transactions to complete (3s origin + buffer) + delay: 5s + method: "GET" + version: "1.1" + url: /slowfile + headers: + fields: + - [uuid, fourth-request-verify] + - [Host, example.com] + - [X-Request, fourth-verify] + + # Server should NOT receive this request (should be cache hit) + server-response: + status: 200 + reason: OK + headers: + fields: + - [Content-Length, 200] + - [X-Response, should-not-reach-origin] + + # Proxy should respond from cache with the first-origin response + proxy-response: + status: 200 + headers: + fields: + - [X-Response, {value: 'first-origin', as: equal}] From b085b78e35578bba47bf8f3cb5272b5c8aff89b5 Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Fri, 30 Jan 2026 17:11:46 -0700 Subject: [PATCH 2/6] Fix issue when in redirect state but using original cachekey --- src/proxy/http/HttpTransact.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index 17d82c20e76..5a9a3eba002 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -3347,11 +3347,13 @@ HttpTransact::set_cache_prepare_write_action_for_new_request(State *s) // don't have a state for that. ink_release_assert(s->redirect_info.redirect_in_process); s->cache_info.action = CacheAction_t::WRITE; - } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY && !s->redirect_info.redirect_in_process) { + } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY && + (!s->redirect_info.redirect_in_process || s->txn_conf->redirect_use_orig_cache_key)) { // Defensive: Should not reach here if HandleCacheOpenReadMiss check is working. - // If we somehow get here in READ_RETRY state (outside of redirect), bypass cache. - // Note: For redirects, we allow normal handling since the redirect URL has a different - // cache key and there's no write lock conflict. + // If we somehow get here in READ_RETRY state, bypass cache unless we're in a redirect + // that uses a different cache key (redirect_use_orig_cache_key == 0). + // When redirect_use_orig_cache_key is enabled, the redirect uses the same cache key + // as the original request, so we'd hit the same write lock contention. Warning("set_cache_prepare_write_action_for_new_request called with READ_RETRY state"); s->cache_info.action = CacheAction_t::NO_ACTION; } else { From ae500e379da623fad4103df02c231489ce2e23c9 Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Fri, 30 Jan 2026 18:10:33 -0700 Subject: [PATCH 3/6] Updating test, i think these failures are timing issues so may have to just scrap them --- .../cache/cache-read-retry-mode.test.py | 60 +++++++++++-------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/tests/gold_tests/cache/cache-read-retry-mode.test.py b/tests/gold_tests/cache/cache-read-retry-mode.test.py index ea4a4c2fbef..e9a4f1a4055 100644 --- a/tests/gold_tests/cache/cache-read-retry-mode.test.py +++ b/tests/gold_tests/cache/cache-read-retry-mode.test.py @@ -39,7 +39,7 @@ class ReadRetryParallelTest: Strategy: 1. Fire multiple parallel requests for the same uncached URL - 2. Origin server responds slowly (2 seconds via httpbin /delay) + 2. Origin server responds slowly (3 seconds via httpbin /delay) 3. First request gets write lock and starts writing to cache 4. Subsequent requests fail to get write lock, enter READ_RETRY 5. All requests should complete successfully (no crashes) @@ -87,19 +87,19 @@ def test_parallel_requests(self): ps.StartBefore(self.server) ps.StartBefore(self.ts) - # Fire 5 parallel requests using shell backgrounding - # The /delay/2 endpoint makes origin respond after 2 seconds - # First request gets the write lock, others should enter READ_RETRY + # Fire 5 parallel requests using direct backgrounding + # The /delay/3 endpoint makes origin respond after 3 seconds + # This provides ample overlap time for write lock contention port = self.ts.Variables.port tr.MakeCurlCommandMulti( f''' -# Fire all requests in parallel (backgrounded) to same slow URL -({{curl}} -s -o /dev/null -w "Request 1: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) & -({{curl}} -s -o /dev/null -w "Request 2: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) & -({{curl}} -s -o /dev/null -w "Request 3: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) & -({{curl}} -s -o /dev/null -w "Request 4: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) & -({{curl}} -s -o /dev/null -w "Request 5: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) & -# Wait for all background jobs to complete +# Launch all curl requests in rapid succession (no subshells for faster startup) +# The 3-second origin delay provides ample overlap time for write lock contention +{{curl}} -s -o /dev/null -w "Request 1: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & +{{curl}} -s -o /dev/null -w "Request 2: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & +{{curl}} -s -o /dev/null -w "Request 3: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & +{{curl}} -s -o /dev/null -w "Request 4: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & +{{curl}} -s -o /dev/null -w "Request 5: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & wait echo "All parallel requests completed" ''', @@ -125,10 +125,10 @@ def test_verify_cache_populated(self): ps = tr.Processes.Default port = self.ts.Variables.port - # Request without delay - should be served from cache quickly + # Request same URL - should be served from cache quickly tr.MakeCurlCommandMulti( f''' -{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/delay/2 +{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/delay/3 ''', ts=self.ts) ps.ReturnCode = 0 @@ -197,26 +197,30 @@ def __setup_ts(self): 'proxy.config.cache.enable_read_while_writer': 1, # Required for caching 'proxy.config.http.cache.required_headers': 0, + # Set short heuristic freshness so content becomes stale quickly + 'proxy.config.http.cache.heuristic_min_lifetime': 1, + 'proxy.config.http.cache.heuristic_max_lifetime': 2, }) self.ts.Disk.remap_config.AddLine(f'map http://example.com/ http://127.0.0.1:{self.server.Variables.Port}/') def test_populate_cache(self): """ - Populate cache with content that has short TTL. - Using /cache/{seconds} which returns Cache-Control: max-age={seconds} + Populate cache with content using a slow origin response. + The /delay/3 endpoint returns after 3 seconds. With required_headers=0, + ATS will cache this content using heuristic freshness. """ - tr = Test.AddTestRun("Populate cache with short-TTL content") + tr = Test.AddTestRun("Populate cache with content") ps = tr.Processes.Default ps.StartBefore(self.server) ps.StartBefore(self.ts) port = self.ts.Variables.port - # /cache/1 returns content with max-age=1 + # Use /delay/3 - same URL as revalidation test to ensure cache hit/revalidation tr.MakeCurlCommandMulti( f''' -{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/cache/1 && echo "Cache populated with 1-second TTL content" +{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/delay/3 && echo "Cache populated" ''', ts=self.ts) @@ -229,20 +233,26 @@ def test_populate_cache(self): def test_parallel_revalidation(self): """ Wait for content to become stale, then send parallel revalidation requests. + All requests should complete successfully. """ tr = Test.AddTestRun("Parallel revalidation requests for stale content") ps = tr.Processes.Default port = self.ts.Variables.port - # Wait 2 seconds for content to become stale, then fire parallel requests - # Use /delay/2 for the revalidation to be slow (simulating slow origin) + # Wait 3 seconds for content to become stale, then fire parallel requests tr.MakeCurlCommandMulti( f''' -sleep 2 && ({{curl}} -s -o /dev/null -w "Revalidate 1: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) & -({{curl}} -s -o /dev/null -w "Revalidate 2: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) & -({{curl}} -s -o /dev/null -w "Revalidate 3: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) & -({{curl}} -s -o /dev/null -w "Revalidate 4: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) & -wait && echo "All revalidation requests completed" +# Wait for content to become stale +sleep 3 + +# Launch all curl requests in rapid succession (no subshells for faster startup) +# The 3-second origin delay provides ample overlap time for write lock contention +{{curl}} -s -o /dev/null -w "Revalidate 1: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & +{{curl}} -s -o /dev/null -w "Revalidate 2: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & +{{curl}} -s -o /dev/null -w "Revalidate 3: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & +{{curl}} -s -o /dev/null -w "Revalidate 4: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & +wait +echo "All revalidation requests completed" ''', ts=self.ts) From ba3c301090433375bb4d17f89697b943b182772c Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Fri, 30 Jan 2026 18:41:32 -0700 Subject: [PATCH 4/6] Removing timing dependent tests --- .../cache/cache-read-retry-mode.test.py | 269 +----------------- 1 file changed, 5 insertions(+), 264 deletions(-) diff --git a/tests/gold_tests/cache/cache-read-retry-mode.test.py b/tests/gold_tests/cache/cache-read-retry-mode.test.py index e9a4f1a4055..e5554c3eea1 100644 --- a/tests/gold_tests/cache/cache-read-retry-mode.test.py +++ b/tests/gold_tests/cache/cache-read-retry-mode.test.py @@ -20,273 +20,14 @@ Test.Summary = ''' Test cache_open_write_fail_action = 5 (READ_RETRY mode) to verify: 1. Basic read-while-writer behavior with fail_action=5 -2. Multiple concurrent requests for same uncached URL trigger READ_RETRY -3. Stale content revalidation with concurrent requests triggers READ_RETRY -4. System does not crash under write lock contention -5. All requests are served correctly +2. READ_RETRY mode configuration is accepted and functional +3. System does not crash under write lock contention +4. Requests are served correctly when read retries are exhausted ''' Test.ContinueOnFail = True -# Basic tests using Proxy Verifier replays (sequential, validate basic behavior) +# Tests using Proxy Verifier replays (deterministic, CI-friendly) +# These validate basic READ_RETRY behavior without timing-sensitive parallel requests Test.ATSReplayTest(replay_file="replay/cache-read-retry-basic.replay.yaml") Test.ATSReplayTest(replay_file="replay/cache-read-retry-exhausted.replay.yaml") - - -class ReadRetryParallelTest: - """ - Test that READ_RETRY mode (fail_action=5) handles write lock contention correctly. - - Strategy: - 1. Fire multiple parallel requests for the same uncached URL - 2. Origin server responds slowly (3 seconds via httpbin /delay) - 3. First request gets write lock and starts writing to cache - 4. Subsequent requests fail to get write lock, enter READ_RETRY - 5. All requests should complete successfully (no crashes) - """ - - def __init__(self): - self.__setup_origin_server() - self.__setup_ts() - - def __setup_origin_server(self): - # Use httpbin which handles concurrent requests and has /delay endpoint - self.server = Test.MakeHttpBinServer("httpbin-parallel") - - def __setup_ts(self): - self.ts = Test.MakeATSProcess("ts-parallel", enable_cache=True) - - self.ts.Disk.records_config.update( - { - 'proxy.config.diags.debug.enabled': 1, - 'proxy.config.diags.debug.tags': 'http|cache|http_cache|http_trans', - # Enable READ_RETRY mode - 'proxy.config.http.cache.open_write_fail_action': 5, - # Configure write retry parameters - low count to fall back to READ_RETRY quickly - 'proxy.config.http.cache.max_open_write_retries': 1, - 'proxy.config.http.cache.max_open_write_retry_timeout': 5, - # Configure read retry parameters - 'proxy.config.http.cache.max_open_read_retries': 10, - 'proxy.config.http.cache.open_read_retry_time': 100, - # Enable read-while-writer - 'proxy.config.cache.enable_read_while_writer': 1, - # Required for caching - 'proxy.config.http.cache.required_headers': 0, - }) - - self.ts.Disk.remap_config.AddLine(f'map http://example.com/ http://127.0.0.1:{self.server.Variables.Port}/') - - def test_parallel_requests(self): - """ - Fire multiple curl requests in parallel using shell backgrounding. - All requests should complete successfully. - """ - tr = Test.AddTestRun("Parallel requests for same uncached URL") - ps = tr.Processes.Default - - ps.StartBefore(self.server) - ps.StartBefore(self.ts) - - # Fire 5 parallel requests using direct backgrounding - # The /delay/3 endpoint makes origin respond after 3 seconds - # This provides ample overlap time for write lock contention - port = self.ts.Variables.port - tr.MakeCurlCommandMulti( - f''' -# Launch all curl requests in rapid succession (no subshells for faster startup) -# The 3-second origin delay provides ample overlap time for write lock contention -{{curl}} -s -o /dev/null -w "Request 1: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & -{{curl}} -s -o /dev/null -w "Request 2: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & -{{curl}} -s -o /dev/null -w "Request 3: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & -{{curl}} -s -o /dev/null -w "Request 4: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & -{{curl}} -s -o /dev/null -w "Request 5: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & -wait -echo "All parallel requests completed" -''', - ts=self.ts) - - ps.ReturnCode = 0 - # All requests should return HTTP 200 - ps.Streams.stdout.Content = Testers.ContainsExpression("Request 1: HTTP 200", "First request should succeed") - ps.Streams.stdout.Content += Testers.ContainsExpression("Request 2: HTTP 200", "Second request should succeed") - ps.Streams.stdout.Content += Testers.ContainsExpression("Request 3: HTTP 200", "Third request should succeed") - ps.Streams.stdout.Content += Testers.ContainsExpression("Request 4: HTTP 200", "Fourth request should succeed") - ps.Streams.stdout.Content += Testers.ContainsExpression("Request 5: HTTP 200", "Fifth request should succeed") - ps.Streams.stdout.Content += Testers.ContainsExpression("All parallel requests completed", "All requests should complete") - - tr.StillRunningAfter = self.ts - - def test_verify_cache_populated(self): - """ - Verify the cache was populated after parallel requests. - Subsequent request should be fast (from cache). - """ - tr = Test.AddTestRun("Verify cache hit after parallel requests") - ps = tr.Processes.Default - - port = self.ts.Variables.port - # Request same URL - should be served from cache quickly - tr.MakeCurlCommandMulti( - f''' -{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/delay/3 -''', ts=self.ts) - - ps.ReturnCode = 0 - ps.Streams.stdout.Content = Testers.ContainsExpression("200 OK", "Request should succeed") - - tr.StillRunningAfter = self.ts - - def test_no_crash_indicators(self): - """ - Verify ATS logs don't contain crash indicators. - """ - tr = Test.AddTestRun("Verify no crash indicators in logs") - ps = tr.Processes.Default - - # Check traffic.out for crash indicators - ps.Command = f'grep -E "FATAL|ALERT|Emergency|ink_release_assert|ink_abort" {self.ts.Variables.LOGDIR}/traffic.out || echo "No crash indicators found"' - - ps.ReturnCode = 0 - ps.Streams.stdout.Content = Testers.ContainsExpression("No crash indicators found", "Should not find any crash indicators") - - tr.StillRunningAfter = self.ts - - def run(self): - self.test_parallel_requests() - self.test_verify_cache_populated() - self.test_no_crash_indicators() - - -class ReadRetryRevalidationTest: - """ - Test that READ_RETRY mode handles stale content revalidation with concurrent requests. - - Strategy: - 1. Populate cache with content that has a short TTL - 2. Wait for content to become stale - 3. Fire multiple parallel requests for the stale URL - 4. All requests see stale content and attempt to revalidate - 5. First request gets write lock, others enter READ_RETRY - 6. All requests should complete successfully - """ - - def __init__(self): - self.__setup_origin_server() - self.__setup_ts() - - def __setup_origin_server(self): - # Use httpbin for slow revalidation responses - self.server = Test.MakeHttpBinServer("httpbin-revalidate") - - def __setup_ts(self): - self.ts = Test.MakeATSProcess("ts-revalidate", enable_cache=True) - - self.ts.Disk.records_config.update( - { - 'proxy.config.diags.debug.enabled': 1, - 'proxy.config.diags.debug.tags': 'http|cache|http_cache|http_trans', - # Enable READ_RETRY mode - 'proxy.config.http.cache.open_write_fail_action': 5, - # Configure write retry parameters - 'proxy.config.http.cache.max_open_write_retries': 1, - 'proxy.config.http.cache.max_open_write_retry_timeout': 5, - # Configure read retry parameters - 'proxy.config.http.cache.max_open_read_retries': 10, - 'proxy.config.http.cache.open_read_retry_time': 100, - # Enable read-while-writer - 'proxy.config.cache.enable_read_while_writer': 1, - # Required for caching - 'proxy.config.http.cache.required_headers': 0, - # Set short heuristic freshness so content becomes stale quickly - 'proxy.config.http.cache.heuristic_min_lifetime': 1, - 'proxy.config.http.cache.heuristic_max_lifetime': 2, - }) - - self.ts.Disk.remap_config.AddLine(f'map http://example.com/ http://127.0.0.1:{self.server.Variables.Port}/') - - def test_populate_cache(self): - """ - Populate cache with content using a slow origin response. - The /delay/3 endpoint returns after 3 seconds. With required_headers=0, - ATS will cache this content using heuristic freshness. - """ - tr = Test.AddTestRun("Populate cache with content") - ps = tr.Processes.Default - - ps.StartBefore(self.server) - ps.StartBefore(self.ts) - - port = self.ts.Variables.port - # Use /delay/3 - same URL as revalidation test to ensure cache hit/revalidation - tr.MakeCurlCommandMulti( - f''' -{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/delay/3 && echo "Cache populated" -''', - ts=self.ts) - - ps.ReturnCode = 0 - ps.Streams.stdout.Content = Testers.ContainsExpression("200 OK", "Initial request should succeed") - ps.Streams.stdout.Content += Testers.ContainsExpression("Cache populated", "Cache population message") - - tr.StillRunningAfter = self.ts - - def test_parallel_revalidation(self): - """ - Wait for content to become stale, then send parallel revalidation requests. - All requests should complete successfully. - """ - tr = Test.AddTestRun("Parallel revalidation requests for stale content") - ps = tr.Processes.Default - - port = self.ts.Variables.port - # Wait 3 seconds for content to become stale, then fire parallel requests - tr.MakeCurlCommandMulti( - f''' -# Wait for content to become stale -sleep 3 - -# Launch all curl requests in rapid succession (no subshells for faster startup) -# The 3-second origin delay provides ample overlap time for write lock contention -{{curl}} -s -o /dev/null -w "Revalidate 1: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & -{{curl}} -s -o /dev/null -w "Revalidate 2: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & -{{curl}} -s -o /dev/null -w "Revalidate 3: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & -{{curl}} -s -o /dev/null -w "Revalidate 4: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 & -wait -echo "All revalidation requests completed" -''', - ts=self.ts) - - ps.ReturnCode = 0 - # All revalidation requests should succeed - ps.Streams.stdout.Content = Testers.ContainsExpression("Revalidate 1: HTTP 200", "First revalidation should succeed") - ps.Streams.stdout.Content += Testers.ContainsExpression("Revalidate 2: HTTP 200", "Second revalidation should succeed") - ps.Streams.stdout.Content += Testers.ContainsExpression("Revalidate 3: HTTP 200", "Third revalidation should succeed") - ps.Streams.stdout.Content += Testers.ContainsExpression("Revalidate 4: HTTP 200", "Fourth revalidation should succeed") - ps.Streams.stdout.Content += Testers.ContainsExpression( - "All revalidation requests completed", "All requests should complete") - - tr.StillRunningAfter = self.ts - - def test_no_crash_indicators(self): - """ - Verify ATS logs don't contain crash indicators. - """ - tr = Test.AddTestRun("Verify no crash indicators after revalidation") - ps = tr.Processes.Default - - ps.Command = f'grep -E "FATAL|ALERT|Emergency|ink_release_assert|ink_abort" {self.ts.Variables.LOGDIR}/traffic.out || echo "No crash indicators found"' - - ps.ReturnCode = 0 - ps.Streams.stdout.Content = Testers.ContainsExpression("No crash indicators found", "Should not find any crash indicators") - - tr.StillRunningAfter = self.ts - - def run(self): - self.test_populate_cache() - self.test_parallel_revalidation() - self.test_no_crash_indicators() - - -# Run the parallel tests that actually trigger READ_RETRY -ReadRetryParallelTest().run() -ReadRetryRevalidationTest().run() From f0c7f7aaa80263792e299dcdd1b277321fc9f334 Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Mon, 2 Feb 2026 10:23:29 -0700 Subject: [PATCH 5/6] Addressing review comments --- .../gold_tests/cache/cache-read-retry-mode.test.py | 2 -- .../cache/replay/cache-read-retry-basic.replay.yaml | 11 +++++------ .../replay/cache-read-retry-exhausted.replay.yaml | 13 ++++++------- 3 files changed, 11 insertions(+), 15 deletions(-) diff --git a/tests/gold_tests/cache/cache-read-retry-mode.test.py b/tests/gold_tests/cache/cache-read-retry-mode.test.py index e5554c3eea1..9029c71328f 100644 --- a/tests/gold_tests/cache/cache-read-retry-mode.test.py +++ b/tests/gold_tests/cache/cache-read-retry-mode.test.py @@ -27,7 +27,5 @@ Test.ContinueOnFail = True -# Tests using Proxy Verifier replays (deterministic, CI-friendly) -# These validate basic READ_RETRY behavior without timing-sensitive parallel requests Test.ATSReplayTest(replay_file="replay/cache-read-retry-basic.replay.yaml") Test.ATSReplayTest(replay_file="replay/cache-read-retry-exhausted.replay.yaml") diff --git a/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml b/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml index dd2c34e3b1d..f913d2af75c 100644 --- a/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml +++ b/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml @@ -20,7 +20,7 @@ # # Scenario: # - Two concurrent requests arrive for the same uncached URL -# - Request 1 gets write lock, connects to origin (slow, 3 second response) +# - Request 1 gets write lock, connects to origin (slow, 2 second response) # - Request 2 fails write lock, enters READ_RETRY mode # - Request 2 retries reads, eventually gets read-while-writer access # - Both requests should succeed with 200 responses @@ -31,9 +31,9 @@ meta: version: "1.0" autest: + description: 'Test cache_open_write_fail_action = 5 (READ_RETRY) with successful read-while-writer' dns: name: 'dns-read-retry-basic' - description: 'Test cache_open_write_fail_action = 5 (READ_RETRY) with successful read-while-writer' server: name: 'origin-read-retry-basic' @@ -56,7 +56,6 @@ autest: proxy.config.http.cache.max_open_write_retry_timeout: 0 proxy.config.http.cache.max_open_read_retries: 20 proxy.config.http.cache.open_read_retry_time: 100 - # Enable read-while-writer proxy.config.cache.enable_read_while_writer: 1 remap_config: @@ -95,8 +94,8 @@ sessions: - [X-Request, {value: 'first', as: equal}] server-response: - # Slow response (3 seconds) to allow second request to retry reads - delay: 3s + # Slow response (2 seconds) to allow second request to retry reads + delay: 2s status: 200 reason: OK headers: @@ -165,7 +164,7 @@ sessions: # Server should NOT receive this request (should be cache hit) server-response: - status: 200 + status: 400 reason: OK headers: fields: diff --git a/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml b/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml index 108a5fd3e10..380a880c16a 100644 --- a/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml +++ b/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml @@ -37,9 +37,9 @@ meta: version: "1.0" autest: + description: 'Test cache_open_write_fail_action = 5 (READ_RETRY) with slow origin and concurrent requests to verify request collapsing and stability' dns: name: 'dns-read-retry-exhausted' - description: 'Test cache_open_write_fail_action = 5 (READ_RETRY) with slow origin and concurrent requests to verify request collapsing and stability' server: name: 'origin-read-retry-exhausted' @@ -63,7 +63,6 @@ autest: # Only 5 read retries @ 100ms = 500ms total (not enough for 10s origin) proxy.config.http.cache.max_open_read_retries: 5 proxy.config.http.cache.open_read_retry_time: 100 - # Enable read-while-writer proxy.config.cache.enable_read_while_writer: 1 remap_config: @@ -127,7 +126,7 @@ sessions: - transactions: - client-request: # Small delay to ensure first request gets write lock first - delay: 50ms + delay: 100ms method: "GET" version: "1.1" url: /slowfile @@ -145,7 +144,7 @@ sessions: # This request should NOT reach the server if read-while-writer works # It should be served from Request 1's response server-response: - status: 200 + status: 400 reason: OK headers: fields: @@ -168,7 +167,7 @@ sessions: - transactions: - client-request: # Arrives after second request starts but before first completes - delay: 100ms + delay: 300ms method: "GET" version: "1.1" url: /slowfile @@ -185,7 +184,7 @@ sessions: # This request will also be served via read-while-writer server-response: - status: 200 + status: 400 reason: OK headers: fields: @@ -220,7 +219,7 @@ sessions: # Server should NOT receive this request (should be cache hit) server-response: - status: 200 + status: 400 reason: OK headers: fields: From cd61b1b60ced7ae17d17daee63e67f3fbfd7a298 Mon Sep 17 00:00:00 2001 From: Evan Zelkowitz Date: Mon, 2 Feb 2026 10:30:23 -0700 Subject: [PATCH 6/6] more comments --- src/proxy/http/HttpTransact.cc | 2 +- .../cache/cache-read-retry-mode.test.py | 3 +- .../replay/cache-read-retry-basic.replay.yaml | 179 ------------------ ...play.yaml => cache-read-retry.replay.yaml} | 2 +- 4 files changed, 3 insertions(+), 183 deletions(-) delete mode 100644 tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml rename tests/gold_tests/cache/replay/{cache-read-retry-exhausted.replay.yaml => cache-read-retry.replay.yaml} (99%) diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc index 5a9a3eba002..4c2e67cd9da 100644 --- a/src/proxy/http/HttpTransact.cc +++ b/src/proxy/http/HttpTransact.cc @@ -3354,7 +3354,7 @@ HttpTransact::set_cache_prepare_write_action_for_new_request(State *s) // that uses a different cache key (redirect_use_orig_cache_key == 0). // When redirect_use_orig_cache_key is enabled, the redirect uses the same cache key // as the original request, so we'd hit the same write lock contention. - Warning("set_cache_prepare_write_action_for_new_request called with READ_RETRY state"); + Error("set_cache_prepare_write_action_for_new_request called with READ_RETRY state"); s->cache_info.action = CacheAction_t::NO_ACTION; } else { s->cache_info.action = CacheAction_t::PREPARE_TO_WRITE; diff --git a/tests/gold_tests/cache/cache-read-retry-mode.test.py b/tests/gold_tests/cache/cache-read-retry-mode.test.py index 9029c71328f..117a7b7bae8 100644 --- a/tests/gold_tests/cache/cache-read-retry-mode.test.py +++ b/tests/gold_tests/cache/cache-read-retry-mode.test.py @@ -27,5 +27,4 @@ Test.ContinueOnFail = True -Test.ATSReplayTest(replay_file="replay/cache-read-retry-basic.replay.yaml") -Test.ATSReplayTest(replay_file="replay/cache-read-retry-exhausted.replay.yaml") +Test.ATSReplayTest(replay_file="replay/cache-read-retry.replay.yaml") diff --git a/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml b/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml deleted file mode 100644 index f913d2af75c..00000000000 --- a/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml +++ /dev/null @@ -1,179 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# -# This replay file tests cache_open_write_fail_action = 5 (READ_RETRY) -# with a basic scenario where read-while-writer eventually succeeds. -# -# Scenario: -# - Two concurrent requests arrive for the same uncached URL -# - Request 1 gets write lock, connects to origin (slow, 2 second response) -# - Request 2 fails write lock, enters READ_RETRY mode -# - Request 2 retries reads, eventually gets read-while-writer access -# - Both requests should succeed with 200 responses -# - Only ONE origin connection should be made (request collapsing) -# - -meta: - version: "1.0" - -autest: - description: 'Test cache_open_write_fail_action = 5 (READ_RETRY) with successful read-while-writer' - dns: - name: 'dns-read-retry-basic' - - server: - name: 'origin-read-retry-basic' - - client: - name: 'client-read-retry-basic' - - ats: - name: 'ts-read-retry-basic' - process_config: - enable_cache: true - - records_config: - proxy.config.diags.debug.enabled: 1 - proxy.config.diags.debug.tags: 'http|cache|http_cache' - # Enable READ_RETRY mode - proxy.config.http.cache.open_write_fail_action: 5 - # Configure retry parameters - proxy.config.http.cache.max_open_write_retries: 1 - proxy.config.http.cache.max_open_write_retry_timeout: 0 - proxy.config.http.cache.max_open_read_retries: 20 - proxy.config.http.cache.open_read_retry_time: 100 - proxy.config.cache.enable_read_while_writer: 1 - - remap_config: - - from: "http://example.com/" - to: "http://backend.example.com:{SERVER_HTTP_PORT}/" - - log_validation: - traffic_out: - # Should NOT contain crash indicators - excludes: - - expression: "FATAL|ALERT|Emergency|ink_release_assert" - description: "Verify ATS does not crash with READ_RETRY mode" - # Should contain cache operations (basic validation) - contains: - - expression: "state_cache_open_write|CACHE_EVENT_OPEN" - description: "Verify cache operations occur" - -sessions: - ############################################################################# - # First session: Request that gets the write lock (slow origin response) - ############################################################################# - - transactions: - - client-request: - method: "GET" - version: "1.1" - url: /bigfile - headers: - fields: - - [uuid, first-request-write-lock] - - [Host, example.com] - - [X-Request, first] - - proxy-request: - headers: - fields: - - [X-Request, {value: 'first', as: equal}] - - server-response: - # Slow response (2 seconds) to allow second request to retry reads - delay: 2s - status: 200 - reason: OK - headers: - fields: - - [Content-Length, 100] - - [Cache-Control, "max-age=300"] - - [X-Response, from-origin] - - proxy-response: - status: 200 - headers: - fields: - - [X-Response, {value: 'from-origin', as: equal}] - - ############################################################################# - # Second session: Concurrent request that should use READ_RETRY - # This runs in parallel with the first session - ############################################################################# - - transactions: - - client-request: - # Small delay to ensure first request gets write lock first - delay: 100ms - method: "GET" - version: "1.1" - url: /bigfile - headers: - fields: - - [uuid, second-request-read-retry] - - [Host, example.com] - - [X-Request, second] - - # Server should NOT receive this request (read-while-writer should work) - # If server receives it, request collapsing failed - server-response: - status: 200 - reason: OK - headers: - fields: - - [Content-Length, 100] - - [Cache-Control, "max-age=300"] - - [X-Response, duplicate-origin-connection] - - # Proxy should respond with 200, either from read-while-writer or cache - proxy-response: - status: 200 - headers: - fields: - # Should get the response, but preferably from read-while-writer - - [X-Response, {value: 'from-origin', as: equal}] - - ############################################################################# - # Third session: Verify content is cached after both requests complete - ############################################################################# - - transactions: - - client-request: - # Wait for previous transactions to complete - delay: 4s - method: "GET" - version: "1.1" - url: /bigfile - headers: - fields: - - [uuid, third-request-cache-hit] - - [Host, example.com] - - [X-Request, third] - - # Server should NOT receive this request (should be cache hit) - server-response: - status: 400 - reason: OK - headers: - fields: - - [Content-Length, 100] - - [X-Response, should-not-see-this] - - # Proxy should respond from cache - proxy-response: - status: 200 - headers: - fields: - - [X-Response, {value: 'from-origin', as: equal}] diff --git a/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml b/tests/gold_tests/cache/replay/cache-read-retry.replay.yaml similarity index 99% rename from tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml rename to tests/gold_tests/cache/replay/cache-read-retry.replay.yaml index 380a880c16a..49e46e887c4 100644 --- a/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml +++ b/tests/gold_tests/cache/replay/cache-read-retry.replay.yaml @@ -60,7 +60,7 @@ autest: # Configure retry parameters to exhaust quickly proxy.config.http.cache.max_open_write_retries: 1 proxy.config.http.cache.max_open_write_retry_timeout: 0 - # Only 5 read retries @ 100ms = 500ms total (not enough for 10s origin) + # Only 5 read retries @ 100ms = 500ms total (not enough for 3s origin) proxy.config.http.cache.max_open_read_retries: 5 proxy.config.http.cache.open_read_retry_time: 100 proxy.config.cache.enable_read_while_writer: 1