From 6b08fc7fdffe18dadec7dad3536a2b3448a73054 Mon Sep 17 00:00:00 2001
From: Evan Zelkowitz <e_zelkowitz@apple.com>
Date: Fri, 30 Jan 2026 13:52:24 -0700
Subject: [PATCH 1/6] Add autest for fail action 5 cache read retry mode add
 fixes around read_retry write fail action 5

---
 src/proxy/http/HttpTransact.cc                |  12 +
 .../cache/cache-read-retry-mode.test.py       | 282 ++++++++++++++++++
 .../replay/cache-read-retry-basic.replay.yaml | 180 +++++++++++
 .../cache-read-retry-exhausted.replay.yaml    | 235 +++++++++++++++
 4 files changed, 709 insertions(+)
 create mode 100644 tests/gold_tests/cache/cache-read-retry-mode.test.py
 create mode 100644 tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml
 create mode 100644 tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml

diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc
index 496699d1839..17d82c20e76 100644
--- a/src/proxy/http/HttpTransact.cc
+++ b/src/proxy/http/HttpTransact.cc
@@ -3270,6 +3270,11 @@ HttpTransact::HandleCacheOpenReadMiss(State *s)
     s->cache_info.action = CacheAction_t::NO_ACTION;
   } else if (s->api_server_response_no_store) { // plugin may have decided not to cache the response
     s->cache_info.action = CacheAction_t::NO_ACTION;
+  } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY) {
+    // We've looped back around due to failing to read during READ_RETRY mode.
+    // Don't attempt another cache write - just proxy to origin without caching.
+    TxnDbg(dbg_ctl_http_trans, "READ_RETRY cache read failed, bypassing cache");
+    s->cache_info.action = CacheAction_t::NO_ACTION;
   } else {
     HttpTransact::set_cache_prepare_write_action_for_new_request(s);
   }
@@ -3342,6 +3347,13 @@ HttpTransact::set_cache_prepare_write_action_for_new_request(State *s)
     // don't have a state for that.
     ink_release_assert(s->redirect_info.redirect_in_process);
     s->cache_info.action = CacheAction_t::WRITE;
+  } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY && !s->redirect_info.redirect_in_process) {
+    // Defensive: Should not reach here if HandleCacheOpenReadMiss check is working.
+    // If we somehow get here in READ_RETRY state (outside of redirect), bypass cache.
+    // Note: For redirects, we allow normal handling since the redirect URL has a different
+    // cache key and there's no write lock conflict.
+    Warning("set_cache_prepare_write_action_for_new_request called with READ_RETRY state");
+    s->cache_info.action = CacheAction_t::NO_ACTION;
   } else {
     s->cache_info.action           = CacheAction_t::PREPARE_TO_WRITE;
     s->cache_info.write_lock_state = HttpTransact::CacheWriteLock_t::INIT;
diff --git a/tests/gold_tests/cache/cache-read-retry-mode.test.py b/tests/gold_tests/cache/cache-read-retry-mode.test.py
new file mode 100644
index 00000000000..ea4a4c2fbef
--- /dev/null
+++ b/tests/gold_tests/cache/cache-read-retry-mode.test.py
@@ -0,0 +1,282 @@
+'''
+Test cache_open_write_fail_action = 5 (READ_RETRY mode)
+'''
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+Test.Summary = '''
+Test cache_open_write_fail_action = 5 (READ_RETRY mode) to verify:
+1. Basic read-while-writer behavior with fail_action=5
+2. Multiple concurrent requests for same uncached URL trigger READ_RETRY
+3. Stale content revalidation with concurrent requests triggers READ_RETRY
+4. System does not crash under write lock contention
+5. All requests are served correctly
+'''
+
+Test.ContinueOnFail = True
+
+# Basic tests using Proxy Verifier replays (sequential, validate basic behavior)
+Test.ATSReplayTest(replay_file="replay/cache-read-retry-basic.replay.yaml")
+Test.ATSReplayTest(replay_file="replay/cache-read-retry-exhausted.replay.yaml")
+
+
+class ReadRetryParallelTest:
+    """
+    Test that READ_RETRY mode (fail_action=5) handles write lock contention correctly.
+
+    Strategy:
+    1. Fire multiple parallel requests for the same uncached URL
+    2. Origin server responds slowly (2 seconds via httpbin /delay)
+    3. First request gets write lock and starts writing to cache
+    4. Subsequent requests fail to get write lock, enter READ_RETRY
+    5. All requests should complete successfully (no crashes)
+    """
+
+    def __init__(self):
+        self.__setup_origin_server()
+        self.__setup_ts()
+
+    def __setup_origin_server(self):
+        # Use httpbin which handles concurrent requests and has /delay endpoint
+        self.server = Test.MakeHttpBinServer("httpbin-parallel")
+
+    def __setup_ts(self):
+        self.ts = Test.MakeATSProcess("ts-parallel", enable_cache=True)
+
+        self.ts.Disk.records_config.update(
+            {
+                'proxy.config.diags.debug.enabled': 1,
+                'proxy.config.diags.debug.tags': 'http|cache|http_cache|http_trans',
+                # Enable READ_RETRY mode
+                'proxy.config.http.cache.open_write_fail_action': 5,
+                # Configure write retry parameters - low count to fall back to READ_RETRY quickly
+                'proxy.config.http.cache.max_open_write_retries': 1,
+                'proxy.config.http.cache.max_open_write_retry_timeout': 5,
+                # Configure read retry parameters
+                'proxy.config.http.cache.max_open_read_retries': 10,
+                'proxy.config.http.cache.open_read_retry_time': 100,
+                # Enable read-while-writer
+                'proxy.config.cache.enable_read_while_writer': 1,
+                # Required for caching
+                'proxy.config.http.cache.required_headers': 0,
+            })
+
+        self.ts.Disk.remap_config.AddLine(f'map http://example.com/ http://127.0.0.1:{self.server.Variables.Port}/')
+
+    def test_parallel_requests(self):
+        """
+        Fire multiple curl requests in parallel using shell backgrounding.
+        All requests should complete successfully.
+        """
+        tr = Test.AddTestRun("Parallel requests for same uncached URL")
+        ps = tr.Processes.Default
+
+        ps.StartBefore(self.server)
+        ps.StartBefore(self.ts)
+
+        # Fire 5 parallel requests using shell backgrounding
+        # The /delay/2 endpoint makes origin respond after 2 seconds
+        # First request gets the write lock, others should enter READ_RETRY
+        port = self.ts.Variables.port
+        tr.MakeCurlCommandMulti(
+            f'''
+# Fire all requests in parallel (backgrounded) to same slow URL
+({{curl}} -s -o /dev/null -w "Request 1: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) &
+({{curl}} -s -o /dev/null -w "Request 2: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) &
+({{curl}} -s -o /dev/null -w "Request 3: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) &
+({{curl}} -s -o /dev/null -w "Request 4: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) &
+({{curl}} -s -o /dev/null -w "Request 5: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) &
+# Wait for all background jobs to complete
+wait
+echo "All parallel requests completed"
+''',
+            ts=self.ts)
+
+        ps.ReturnCode = 0
+        # All requests should return HTTP 200
+        ps.Streams.stdout.Content = Testers.ContainsExpression("Request 1: HTTP 200", "First request should succeed")
+        ps.Streams.stdout.Content += Testers.ContainsExpression("Request 2: HTTP 200", "Second request should succeed")
+        ps.Streams.stdout.Content += Testers.ContainsExpression("Request 3: HTTP 200", "Third request should succeed")
+        ps.Streams.stdout.Content += Testers.ContainsExpression("Request 4: HTTP 200", "Fourth request should succeed")
+        ps.Streams.stdout.Content += Testers.ContainsExpression("Request 5: HTTP 200", "Fifth request should succeed")
+        ps.Streams.stdout.Content += Testers.ContainsExpression("All parallel requests completed", "All requests should complete")
+
+        tr.StillRunningAfter = self.ts
+
+    def test_verify_cache_populated(self):
+        """
+        Verify the cache was populated after parallel requests.
+        Subsequent request should be fast (from cache).
+        """
+        tr = Test.AddTestRun("Verify cache hit after parallel requests")
+        ps = tr.Processes.Default
+
+        port = self.ts.Variables.port
+        # Request without delay - should be served from cache quickly
+        tr.MakeCurlCommandMulti(
+            f'''
+{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/delay/2
+''', ts=self.ts)
+
+        ps.ReturnCode = 0
+        ps.Streams.stdout.Content = Testers.ContainsExpression("200 OK", "Request should succeed")
+
+        tr.StillRunningAfter = self.ts
+
+    def test_no_crash_indicators(self):
+        """
+        Verify ATS logs don't contain crash indicators.
+        """
+        tr = Test.AddTestRun("Verify no crash indicators in logs")
+        ps = tr.Processes.Default
+
+        # Check traffic.out for crash indicators
+        ps.Command = f'grep -E "FATAL|ALERT|Emergency|ink_release_assert|ink_abort" {self.ts.Variables.LOGDIR}/traffic.out || echo "No crash indicators found"'
+
+        ps.ReturnCode = 0
+        ps.Streams.stdout.Content = Testers.ContainsExpression("No crash indicators found", "Should not find any crash indicators")
+
+        tr.StillRunningAfter = self.ts
+
+    def run(self):
+        self.test_parallel_requests()
+        self.test_verify_cache_populated()
+        self.test_no_crash_indicators()
+
+
+class ReadRetryRevalidationTest:
+    """
+    Test that READ_RETRY mode handles stale content revalidation with concurrent requests.
+
+    Strategy:
+    1. Populate cache with content that has a short TTL
+    2. Wait for content to become stale
+    3. Fire multiple parallel requests for the stale URL
+    4. All requests see stale content and attempt to revalidate
+    5. First request gets write lock, others enter READ_RETRY
+    6. All requests should complete successfully
+    """
+
+    def __init__(self):
+        self.__setup_origin_server()
+        self.__setup_ts()
+
+    def __setup_origin_server(self):
+        # Use httpbin for slow revalidation responses
+        self.server = Test.MakeHttpBinServer("httpbin-revalidate")
+
+    def __setup_ts(self):
+        self.ts = Test.MakeATSProcess("ts-revalidate", enable_cache=True)
+
+        self.ts.Disk.records_config.update(
+            {
+                'proxy.config.diags.debug.enabled': 1,
+                'proxy.config.diags.debug.tags': 'http|cache|http_cache|http_trans',
+                # Enable READ_RETRY mode
+                'proxy.config.http.cache.open_write_fail_action': 5,
+                # Configure write retry parameters
+                'proxy.config.http.cache.max_open_write_retries': 1,
+                'proxy.config.http.cache.max_open_write_retry_timeout': 5,
+                # Configure read retry parameters
+                'proxy.config.http.cache.max_open_read_retries': 10,
+                'proxy.config.http.cache.open_read_retry_time': 100,
+                # Enable read-while-writer
+                'proxy.config.cache.enable_read_while_writer': 1,
+                # Required for caching
+                'proxy.config.http.cache.required_headers': 0,
+            })
+
+        self.ts.Disk.remap_config.AddLine(f'map http://example.com/ http://127.0.0.1:{self.server.Variables.Port}/')
+
+    def test_populate_cache(self):
+        """
+        Populate cache with content that has short TTL.
+        Using /cache/{seconds} which returns Cache-Control: max-age={seconds}
+        """
+        tr = Test.AddTestRun("Populate cache with short-TTL content")
+        ps = tr.Processes.Default
+
+        ps.StartBefore(self.server)
+        ps.StartBefore(self.ts)
+
+        port = self.ts.Variables.port
+        # /cache/1 returns content with max-age=1
+        tr.MakeCurlCommandMulti(
+            f'''
+{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/cache/1 && echo "Cache populated with 1-second TTL content"
+''',
+            ts=self.ts)
+
+        ps.ReturnCode = 0
+        ps.Streams.stdout.Content = Testers.ContainsExpression("200 OK", "Initial request should succeed")
+        ps.Streams.stdout.Content += Testers.ContainsExpression("Cache populated", "Cache population message")
+
+        tr.StillRunningAfter = self.ts
+
+    def test_parallel_revalidation(self):
+        """
+        Wait for content to become stale, then send parallel revalidation requests.
+        """
+        tr = Test.AddTestRun("Parallel revalidation requests for stale content")
+        ps = tr.Processes.Default
+
+        port = self.ts.Variables.port
+        # Wait 2 seconds for content to become stale, then fire parallel requests
+        # Use /delay/2 for the revalidation to be slow (simulating slow origin)
+        tr.MakeCurlCommandMulti(
+            f'''
+sleep 2 && ({{curl}} -s -o /dev/null -w "Revalidate 1: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) &
+({{curl}} -s -o /dev/null -w "Revalidate 2: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) &
+({{curl}} -s -o /dev/null -w "Revalidate 3: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) &
+({{curl}} -s -o /dev/null -w "Revalidate 4: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) &
+wait && echo "All revalidation requests completed"
+''',
+            ts=self.ts)
+
+        ps.ReturnCode = 0
+        # All revalidation requests should succeed
+        ps.Streams.stdout.Content = Testers.ContainsExpression("Revalidate 1: HTTP 200", "First revalidation should succeed")
+        ps.Streams.stdout.Content += Testers.ContainsExpression("Revalidate 2: HTTP 200", "Second revalidation should succeed")
+        ps.Streams.stdout.Content += Testers.ContainsExpression("Revalidate 3: HTTP 200", "Third revalidation should succeed")
+        ps.Streams.stdout.Content += Testers.ContainsExpression("Revalidate 4: HTTP 200", "Fourth revalidation should succeed")
+        ps.Streams.stdout.Content += Testers.ContainsExpression(
+            "All revalidation requests completed", "All requests should complete")
+
+        tr.StillRunningAfter = self.ts
+
+    def test_no_crash_indicators(self):
+        """
+        Verify ATS logs don't contain crash indicators.
+        """
+        tr = Test.AddTestRun("Verify no crash indicators after revalidation")
+        ps = tr.Processes.Default
+
+        ps.Command = f'grep -E "FATAL|ALERT|Emergency|ink_release_assert|ink_abort" {self.ts.Variables.LOGDIR}/traffic.out || echo "No crash indicators found"'
+
+        ps.ReturnCode = 0
+        ps.Streams.stdout.Content = Testers.ContainsExpression("No crash indicators found", "Should not find any crash indicators")
+
+        tr.StillRunningAfter = self.ts
+
+    def run(self):
+        self.test_populate_cache()
+        self.test_parallel_revalidation()
+        self.test_no_crash_indicators()
+
+
+# Run the parallel tests that actually trigger READ_RETRY
+ReadRetryParallelTest().run()
+ReadRetryRevalidationTest().run()
diff --git a/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml b/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml
new file mode 100644
index 00000000000..dd2c34e3b1d
--- /dev/null
+++ b/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml
@@ -0,0 +1,180 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+#
+# This replay file tests cache_open_write_fail_action = 5 (READ_RETRY)
+# with a basic scenario where read-while-writer eventually succeeds.
+#
+# Scenario:
+# - Two concurrent requests arrive for the same uncached URL
+# - Request 1 gets write lock, connects to origin (slow, 3 second response)
+# - Request 2 fails write lock, enters READ_RETRY mode
+# - Request 2 retries reads, eventually gets read-while-writer access
+# - Both requests should succeed with 200 responses
+# - Only ONE origin connection should be made (request collapsing)
+#
+
+meta:
+  version: "1.0"
+
+autest:
+  dns:
+    name: 'dns-read-retry-basic'
+  description: 'Test cache_open_write_fail_action = 5 (READ_RETRY) with successful read-while-writer'
+
+  server:
+    name: 'origin-read-retry-basic'
+
+  client:
+    name: 'client-read-retry-basic'
+
+  ats:
+    name: 'ts-read-retry-basic'
+    process_config:
+      enable_cache: true
+
+    records_config:
+      proxy.config.diags.debug.enabled: 1
+      proxy.config.diags.debug.tags: 'http|cache|http_cache'
+      # Enable READ_RETRY mode
+      proxy.config.http.cache.open_write_fail_action: 5
+      # Configure retry parameters
+      proxy.config.http.cache.max_open_write_retries: 1
+      proxy.config.http.cache.max_open_write_retry_timeout: 0
+      proxy.config.http.cache.max_open_read_retries: 20
+      proxy.config.http.cache.open_read_retry_time: 100
+      # Enable read-while-writer
+      proxy.config.cache.enable_read_while_writer: 1
+
+    remap_config:
+      - from: "http://example.com/"
+        to: "http://backend.example.com:{SERVER_HTTP_PORT}/"
+
+    log_validation:
+      traffic_out:
+        # Should NOT contain crash indicators
+        excludes:
+          - expression: "FATAL|ALERT|Emergency|ink_release_assert"
+            description: "Verify ATS does not crash with READ_RETRY mode"
+        # Should contain cache operations (basic validation)
+        contains:
+          - expression: "state_cache_open_write|CACHE_EVENT_OPEN"
+            description: "Verify cache operations occur"
+
+sessions:
+  #############################################################################
+  # First session: Request that gets the write lock (slow origin response)
+  #############################################################################
+  - transactions:
+      - client-request:
+          method: "GET"
+          version: "1.1"
+          url: /bigfile
+          headers:
+            fields:
+              - [uuid, first-request-write-lock]
+              - [Host, example.com]
+              - [X-Request, first]
+
+        proxy-request:
+          headers:
+            fields:
+              - [X-Request, {value: 'first', as: equal}]
+
+        server-response:
+          # Slow response (3 seconds) to allow second request to retry reads
+          delay: 3s
+          status: 200
+          reason: OK
+          headers:
+            fields:
+              - [Content-Length, 100]
+              - [Cache-Control, "max-age=300"]
+              - [X-Response, from-origin]
+
+        proxy-response:
+          status: 200
+          headers:
+            fields:
+              - [X-Response, {value: 'from-origin', as: equal}]
+
+  #############################################################################
+  # Second session: Concurrent request that should use READ_RETRY
+  # This runs in parallel with the first session
+  #############################################################################
+  - transactions:
+      - client-request:
+          # Small delay to ensure first request gets write lock first
+          delay: 100ms
+          method: "GET"
+          version: "1.1"
+          url: /bigfile
+          headers:
+            fields:
+              - [uuid, second-request-read-retry]
+              - [Host, example.com]
+              - [X-Request, second]
+
+        # Server should NOT receive this request (read-while-writer should work)
+        # If server receives it, request collapsing failed
+        server-response:
+          status: 200
+          reason: OK
+          headers:
+            fields:
+              - [Content-Length, 100]
+              - [Cache-Control, "max-age=300"]
+              - [X-Response, duplicate-origin-connection]
+
+        # Proxy should respond with 200, either from read-while-writer or cache
+        proxy-response:
+          status: 200
+          headers:
+            fields:
+              # Should get the response, but preferably from read-while-writer
+              - [X-Response, {value: 'from-origin', as: equal}]
+
+  #############################################################################
+  # Third session: Verify content is cached after both requests complete
+  #############################################################################
+  - transactions:
+      - client-request:
+          # Wait for previous transactions to complete
+          delay: 4s
+          method: "GET"
+          version: "1.1"
+          url: /bigfile
+          headers:
+            fields:
+              - [uuid, third-request-cache-hit]
+              - [Host, example.com]
+              - [X-Request, third]
+
+        # Server should NOT receive this request (should be cache hit)
+        server-response:
+          status: 200
+          reason: OK
+          headers:
+            fields:
+              - [Content-Length, 100]
+              - [X-Response, should-not-see-this]
+
+        # Proxy should respond from cache
+        proxy-response:
+          status: 200
+          headers:
+            fields:
+              - [X-Response, {value: 'from-origin', as: equal}]
diff --git a/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml b/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml
new file mode 100644
index 00000000000..108a5fd3e10
--- /dev/null
+++ b/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml
@@ -0,0 +1,235 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+#
+# This replay file tests cache_open_write_fail_action = 5 (READ_RETRY)
+# with concurrent requests and slow origin responses.
+#
+# Scenario:
+# - Multiple concurrent requests arrive for the same uncached URL
+# - Request 1 gets write lock, connects to origin (slow, 3 second response)
+# - Request 2-3 try to read, retry reads, eventually get read-while-writer access
+# - All requests should succeed with 200 responses from read-while-writer
+# - Only ONE origin connection should be made (request collapsing works)
+# - System should NOT crash (validates stability with fail_action=5 enabled)
+#
+# Expected Behavior:
+# 1. Request collapsing works correctly (all requests get first-origin response)
+# 2. Read-while-writer functions properly
+# 3. No crashes occur
+# 4. Content is cached properly
+#
+
+meta:
+  version: "1.0"
+
+autest:
+  dns:
+    name: 'dns-read-retry-exhausted'
+  description: 'Test cache_open_write_fail_action = 5 (READ_RETRY) with slow origin and concurrent requests to verify request collapsing and stability'
+
+  server:
+    name: 'origin-read-retry-exhausted'
+
+  client:
+    name: 'client-read-retry-exhausted'
+
+  ats:
+    name: 'ts-read-retry-exhausted'
+    process_config:
+      enable_cache: true
+
+    records_config:
+      proxy.config.diags.debug.enabled: 1
+      proxy.config.diags.debug.tags: 'http|cache|http_cache|http_trans'
+      # Enable READ_RETRY mode
+      proxy.config.http.cache.open_write_fail_action: 5
+      # Configure retry parameters to exhaust quickly
+      proxy.config.http.cache.max_open_write_retries: 1
+      proxy.config.http.cache.max_open_write_retry_timeout: 0
+      # Only 5 read retries @ 100ms = 500ms total (not enough for 10s origin)
+      proxy.config.http.cache.max_open_read_retries: 5
+      proxy.config.http.cache.open_read_retry_time: 100
+      # Enable read-while-writer
+      proxy.config.cache.enable_read_while_writer: 1
+
+    remap_config:
+      - from: "http://example.com/"
+        to: "http://backend.example.com:{SERVER_HTTP_PORT}/"
+
+    log_validation:
+      traffic_out:
+        # Should NOT contain crash indicators
+        excludes:
+          - expression: "FATAL|ALERT|Emergency|ink_release_assert|ink_abort"
+            description: "Verify ATS does not crash with READ_RETRY mode enabled"
+        # Should contain cache read retries (shows the system is working)
+        contains:
+          - expression: "retrying cache open read|read while write"
+            description: "Verify cache retry mechanism is active"
+
+sessions:
+  #############################################################################
+  # First session: Request that gets the write lock (VERY slow origin)
+  #############################################################################
+  - transactions:
+      - client-request:
+          method: "GET"
+          version: "1.1"
+          url: /slowfile
+          headers:
+            fields:
+              - [uuid, first-request-slow-origin]
+              - [Host, example.com]
+              - [X-Request, first-slow]
+
+        proxy-request:
+          headers:
+            fields:
+              - [X-Request, {value: 'first-slow', as: equal}]
+
+        server-response:
+          # Slow response (3 seconds) to allow second request to retry reads
+          # Must be less than client timeout (5s) but long enough to trigger retries
+          delay: 3s
+          status: 200
+          reason: OK
+          headers:
+            fields:
+              - [Content-Length, 200]
+              - [Cache-Control, "max-age=300"]
+              - [X-Response, first-origin]
+              - [X-Origin-Request, first]
+
+        proxy-response:
+          status: 200
+          headers:
+            fields:
+              - [X-Response, {value: 'first-origin', as: equal}]
+
+  #############################################################################
+  # Second session: Concurrent request that exhausts READ_RETRY attempts
+  # This runs in parallel and triggers the bug path
+  #############################################################################
+  - transactions:
+      - client-request:
+          # Small delay to ensure first request gets write lock first
+          delay: 50ms
+          method: "GET"
+          version: "1.1"
+          url: /slowfile
+          headers:
+            fields:
+              - [uuid, second-request-exhausted]
+              - [Host, example.com]
+              - [X-Request, second-exhausted]
+
+        proxy-request:
+          headers:
+            fields:
+              - [X-Request, {value: 'second-exhausted', as: equal}]
+
+        # This request should NOT reach the server if read-while-writer works
+        # It should be served from Request 1's response
+        server-response:
+          status: 200
+          reason: OK
+          headers:
+            fields:
+              - [Content-Length, 200]
+              - [Cache-Control, "max-age=300"]
+              - [X-Response, second-origin]
+              - [X-Origin-Request, second]
+
+        # Proxy should respond with 200 from read-while-writer (first request's response)
+        proxy-response:
+          status: 200
+          headers:
+            fields:
+              # Should get first-origin via read-while-writer (request collapsing works)
+              - [X-Response, {value: 'first-origin', as: equal}]
+
+  #############################################################################
+  # Third session: Another concurrent request to stress test the system
+  #############################################################################
+  - transactions:
+      - client-request:
+          # Arrives after second request starts but before first completes
+          delay: 100ms
+          method: "GET"
+          version: "1.1"
+          url: /slowfile
+          headers:
+            fields:
+              - [uuid, third-request-concurrent]
+              - [Host, example.com]
+              - [X-Request, third-concurrent]
+
+        proxy-request:
+          headers:
+            fields:
+              - [X-Request, {value: 'third-concurrent', as: equal}]
+
+        # This request will also be served via read-while-writer
+        server-response:
+          status: 200
+          reason: OK
+          headers:
+            fields:
+              - [Content-Length, 200]
+              - [Cache-Control, "max-age=300"]
+              - [X-Response, third-origin]
+              - [X-Origin-Request, third]
+
+        # Should get a response from read-while-writer
+        proxy-response:
+          status: 200
+          headers:
+            fields:
+              # Should get first-origin via read-while-writer
+              - [X-Response, {value: 'first-origin', as: equal}]
+
+  #############################################################################
+  # Fourth session: Verify cache state after all the chaos
+  #############################################################################
+  - transactions:
+      - client-request:
+          # Wait for all previous transactions to complete (3s origin + buffer)
+          delay: 5s
+          method: "GET"
+          version: "1.1"
+          url: /slowfile
+          headers:
+            fields:
+              - [uuid, fourth-request-verify]
+              - [Host, example.com]
+              - [X-Request, fourth-verify]
+
+        # Server should NOT receive this request (should be cache hit)
+        server-response:
+          status: 200
+          reason: OK
+          headers:
+            fields:
+              - [Content-Length, 200]
+              - [X-Response, should-not-reach-origin]
+
+        # Proxy should respond from cache with the first-origin response
+        proxy-response:
+          status: 200
+          headers:
+            fields:
+              - [X-Response, {value: 'first-origin', as: equal}]

From b085b78e35578bba47bf8f3cb5272b5c8aff89b5 Mon Sep 17 00:00:00 2001
From: Evan Zelkowitz <e_zelkowitz@apple.com>
Date: Fri, 30 Jan 2026 17:11:46 -0700
Subject: [PATCH 2/6] Fix issue when in redirect state but using original
 cachekey

---
 src/proxy/http/HttpTransact.cc | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc
index 17d82c20e76..5a9a3eba002 100644
--- a/src/proxy/http/HttpTransact.cc
+++ b/src/proxy/http/HttpTransact.cc
@@ -3347,11 +3347,13 @@ HttpTransact::set_cache_prepare_write_action_for_new_request(State *s)
     // don't have a state for that.
     ink_release_assert(s->redirect_info.redirect_in_process);
     s->cache_info.action = CacheAction_t::WRITE;
-  } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY && !s->redirect_info.redirect_in_process) {
+  } else if (s->cache_info.write_lock_state == CacheWriteLock_t::READ_RETRY &&
+             (!s->redirect_info.redirect_in_process || s->txn_conf->redirect_use_orig_cache_key)) {
     // Defensive: Should not reach here if HandleCacheOpenReadMiss check is working.
-    // If we somehow get here in READ_RETRY state (outside of redirect), bypass cache.
-    // Note: For redirects, we allow normal handling since the redirect URL has a different
-    // cache key and there's no write lock conflict.
+    // If we somehow get here in READ_RETRY state, bypass cache unless we're in a redirect
+    // that uses a different cache key (redirect_use_orig_cache_key == 0).
+    // When redirect_use_orig_cache_key is enabled, the redirect uses the same cache key
+    // as the original request, so we'd hit the same write lock contention.
     Warning("set_cache_prepare_write_action_for_new_request called with READ_RETRY state");
     s->cache_info.action = CacheAction_t::NO_ACTION;
   } else {

From ae500e379da623fad4103df02c231489ce2e23c9 Mon Sep 17 00:00:00 2001
From: Evan Zelkowitz <e_zelkowitz@apple.com>
Date: Fri, 30 Jan 2026 18:10:33 -0700
Subject: [PATCH 3/6] Updating test, i think these failures are timing issues
 so may have to just scrap them

---
 .../cache/cache-read-retry-mode.test.py       | 60 +++++++++++--------
 1 file changed, 35 insertions(+), 25 deletions(-)

diff --git a/tests/gold_tests/cache/cache-read-retry-mode.test.py b/tests/gold_tests/cache/cache-read-retry-mode.test.py
index ea4a4c2fbef..e9a4f1a4055 100644
--- a/tests/gold_tests/cache/cache-read-retry-mode.test.py
+++ b/tests/gold_tests/cache/cache-read-retry-mode.test.py
@@ -39,7 +39,7 @@ class ReadRetryParallelTest:
 
     Strategy:
     1. Fire multiple parallel requests for the same uncached URL
-    2. Origin server responds slowly (2 seconds via httpbin /delay)
+    2. Origin server responds slowly (3 seconds via httpbin /delay)
     3. First request gets write lock and starts writing to cache
     4. Subsequent requests fail to get write lock, enter READ_RETRY
     5. All requests should complete successfully (no crashes)
@@ -87,19 +87,19 @@ def test_parallel_requests(self):
         ps.StartBefore(self.server)
         ps.StartBefore(self.ts)
 
-        # Fire 5 parallel requests using shell backgrounding
-        # The /delay/2 endpoint makes origin respond after 2 seconds
-        # First request gets the write lock, others should enter READ_RETRY
+        # Fire 5 parallel requests using direct backgrounding
+        # The /delay/3 endpoint makes origin respond after 3 seconds
+        # This provides ample overlap time for write lock contention
         port = self.ts.Variables.port
         tr.MakeCurlCommandMulti(
             f'''
-# Fire all requests in parallel (backgrounded) to same slow URL
-({{curl}} -s -o /dev/null -w "Request 1: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) &
-({{curl}} -s -o /dev/null -w "Request 2: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) &
-({{curl}} -s -o /dev/null -w "Request 3: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) &
-({{curl}} -s -o /dev/null -w "Request 4: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) &
-({{curl}} -s -o /dev/null -w "Request 5: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/2) &
-# Wait for all background jobs to complete
+# Launch all curl requests in rapid succession (no subshells for faster startup)
+# The 3-second origin delay provides ample overlap time for write lock contention
+{{curl}} -s -o /dev/null -w "Request 1: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
+{{curl}} -s -o /dev/null -w "Request 2: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
+{{curl}} -s -o /dev/null -w "Request 3: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
+{{curl}} -s -o /dev/null -w "Request 4: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
+{{curl}} -s -o /dev/null -w "Request 5: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
 wait
 echo "All parallel requests completed"
 ''',
@@ -125,10 +125,10 @@ def test_verify_cache_populated(self):
         ps = tr.Processes.Default
 
         port = self.ts.Variables.port
-        # Request without delay - should be served from cache quickly
+        # Request same URL - should be served from cache quickly
         tr.MakeCurlCommandMulti(
             f'''
-{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/delay/2
+{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/delay/3
 ''', ts=self.ts)
 
         ps.ReturnCode = 0
@@ -197,26 +197,30 @@ def __setup_ts(self):
                 'proxy.config.cache.enable_read_while_writer': 1,
                 # Required for caching
                 'proxy.config.http.cache.required_headers': 0,
+                # Set short heuristic freshness so content becomes stale quickly
+                'proxy.config.http.cache.heuristic_min_lifetime': 1,
+                'proxy.config.http.cache.heuristic_max_lifetime': 2,
             })
 
         self.ts.Disk.remap_config.AddLine(f'map http://example.com/ http://127.0.0.1:{self.server.Variables.Port}/')
 
     def test_populate_cache(self):
         """
-        Populate cache with content that has short TTL.
-        Using /cache/{seconds} which returns Cache-Control: max-age={seconds}
+        Populate cache with content using a slow origin response.
+        The /delay/3 endpoint returns after 3 seconds. With required_headers=0,
+        ATS will cache this content using heuristic freshness.
         """
-        tr = Test.AddTestRun("Populate cache with short-TTL content")
+        tr = Test.AddTestRun("Populate cache with content")
         ps = tr.Processes.Default
 
         ps.StartBefore(self.server)
         ps.StartBefore(self.ts)
 
         port = self.ts.Variables.port
-        # /cache/1 returns content with max-age=1
+        # Use /delay/3 - same URL as revalidation test to ensure cache hit/revalidation
         tr.MakeCurlCommandMulti(
             f'''
-{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/cache/1 && echo "Cache populated with 1-second TTL content"
+{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/delay/3 && echo "Cache populated"
 ''',
             ts=self.ts)
 
@@ -229,20 +233,26 @@ def test_populate_cache(self):
     def test_parallel_revalidation(self):
         """
         Wait for content to become stale, then send parallel revalidation requests.
+        All requests should complete successfully.
         """
         tr = Test.AddTestRun("Parallel revalidation requests for stale content")
         ps = tr.Processes.Default
 
         port = self.ts.Variables.port
-        # Wait 2 seconds for content to become stale, then fire parallel requests
-        # Use /delay/2 for the revalidation to be slow (simulating slow origin)
+        # Wait 3 seconds for content to become stale, then fire parallel requests
         tr.MakeCurlCommandMulti(
             f'''
-sleep 2 && ({{curl}} -s -o /dev/null -w "Revalidate 1: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) &
-({{curl}} -s -o /dev/null -w "Revalidate 2: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) &
-({{curl}} -s -o /dev/null -w "Revalidate 3: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) &
-({{curl}} -s -o /dev/null -w "Revalidate 4: HTTP %{{{{http_code}}}}" -H "Host: example.com" http://127.0.0.1:{port}/delay/2 && echo) &
-wait && echo "All revalidation requests completed"
+# Wait for content to become stale
+sleep 3
+
+# Launch all curl requests in rapid succession (no subshells for faster startup)
+# The 3-second origin delay provides ample overlap time for write lock contention
+{{curl}} -s -o /dev/null -w "Revalidate 1: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
+{{curl}} -s -o /dev/null -w "Revalidate 2: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
+{{curl}} -s -o /dev/null -w "Revalidate 3: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
+{{curl}} -s -o /dev/null -w "Revalidate 4: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
+wait
+echo "All revalidation requests completed"
 ''',
             ts=self.ts)
 

From ba3c301090433375bb4d17f89697b943b182772c Mon Sep 17 00:00:00 2001
From: Evan Zelkowitz <e_zelkowitz@apple.com>
Date: Fri, 30 Jan 2026 18:41:32 -0700
Subject: [PATCH 4/6] Removing timing dependent tests

---
 .../cache/cache-read-retry-mode.test.py       | 269 +-----------------
 1 file changed, 5 insertions(+), 264 deletions(-)

diff --git a/tests/gold_tests/cache/cache-read-retry-mode.test.py b/tests/gold_tests/cache/cache-read-retry-mode.test.py
index e9a4f1a4055..e5554c3eea1 100644
--- a/tests/gold_tests/cache/cache-read-retry-mode.test.py
+++ b/tests/gold_tests/cache/cache-read-retry-mode.test.py
@@ -20,273 +20,14 @@
 Test.Summary = '''
 Test cache_open_write_fail_action = 5 (READ_RETRY mode) to verify:
 1. Basic read-while-writer behavior with fail_action=5
-2. Multiple concurrent requests for same uncached URL trigger READ_RETRY
-3. Stale content revalidation with concurrent requests triggers READ_RETRY
-4. System does not crash under write lock contention
-5. All requests are served correctly
+2. READ_RETRY mode configuration is accepted and functional
+3. System does not crash under write lock contention
+4. Requests are served correctly when read retries are exhausted
 '''
 
 Test.ContinueOnFail = True
 
-# Basic tests using Proxy Verifier replays (sequential, validate basic behavior)
+# Tests using Proxy Verifier replays (deterministic, CI-friendly)
+# These validate basic READ_RETRY behavior without timing-sensitive parallel requests
 Test.ATSReplayTest(replay_file="replay/cache-read-retry-basic.replay.yaml")
 Test.ATSReplayTest(replay_file="replay/cache-read-retry-exhausted.replay.yaml")
-
-
-class ReadRetryParallelTest:
-    """
-    Test that READ_RETRY mode (fail_action=5) handles write lock contention correctly.
-
-    Strategy:
-    1. Fire multiple parallel requests for the same uncached URL
-    2. Origin server responds slowly (3 seconds via httpbin /delay)
-    3. First request gets write lock and starts writing to cache
-    4. Subsequent requests fail to get write lock, enter READ_RETRY
-    5. All requests should complete successfully (no crashes)
-    """
-
-    def __init__(self):
-        self.__setup_origin_server()
-        self.__setup_ts()
-
-    def __setup_origin_server(self):
-        # Use httpbin which handles concurrent requests and has /delay endpoint
-        self.server = Test.MakeHttpBinServer("httpbin-parallel")
-
-    def __setup_ts(self):
-        self.ts = Test.MakeATSProcess("ts-parallel", enable_cache=True)
-
-        self.ts.Disk.records_config.update(
-            {
-                'proxy.config.diags.debug.enabled': 1,
-                'proxy.config.diags.debug.tags': 'http|cache|http_cache|http_trans',
-                # Enable READ_RETRY mode
-                'proxy.config.http.cache.open_write_fail_action': 5,
-                # Configure write retry parameters - low count to fall back to READ_RETRY quickly
-                'proxy.config.http.cache.max_open_write_retries': 1,
-                'proxy.config.http.cache.max_open_write_retry_timeout': 5,
-                # Configure read retry parameters
-                'proxy.config.http.cache.max_open_read_retries': 10,
-                'proxy.config.http.cache.open_read_retry_time': 100,
-                # Enable read-while-writer
-                'proxy.config.cache.enable_read_while_writer': 1,
-                # Required for caching
-                'proxy.config.http.cache.required_headers': 0,
-            })
-
-        self.ts.Disk.remap_config.AddLine(f'map http://example.com/ http://127.0.0.1:{self.server.Variables.Port}/')
-
-    def test_parallel_requests(self):
-        """
-        Fire multiple curl requests in parallel using shell backgrounding.
-        All requests should complete successfully.
-        """
-        tr = Test.AddTestRun("Parallel requests for same uncached URL")
-        ps = tr.Processes.Default
-
-        ps.StartBefore(self.server)
-        ps.StartBefore(self.ts)
-
-        # Fire 5 parallel requests using direct backgrounding
-        # The /delay/3 endpoint makes origin respond after 3 seconds
-        # This provides ample overlap time for write lock contention
-        port = self.ts.Variables.port
-        tr.MakeCurlCommandMulti(
-            f'''
-# Launch all curl requests in rapid succession (no subshells for faster startup)
-# The 3-second origin delay provides ample overlap time for write lock contention
-{{curl}} -s -o /dev/null -w "Request 1: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
-{{curl}} -s -o /dev/null -w "Request 2: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
-{{curl}} -s -o /dev/null -w "Request 3: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
-{{curl}} -s -o /dev/null -w "Request 4: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
-{{curl}} -s -o /dev/null -w "Request 5: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
-wait
-echo "All parallel requests completed"
-''',
-            ts=self.ts)
-
-        ps.ReturnCode = 0
-        # All requests should return HTTP 200
-        ps.Streams.stdout.Content = Testers.ContainsExpression("Request 1: HTTP 200", "First request should succeed")
-        ps.Streams.stdout.Content += Testers.ContainsExpression("Request 2: HTTP 200", "Second request should succeed")
-        ps.Streams.stdout.Content += Testers.ContainsExpression("Request 3: HTTP 200", "Third request should succeed")
-        ps.Streams.stdout.Content += Testers.ContainsExpression("Request 4: HTTP 200", "Fourth request should succeed")
-        ps.Streams.stdout.Content += Testers.ContainsExpression("Request 5: HTTP 200", "Fifth request should succeed")
-        ps.Streams.stdout.Content += Testers.ContainsExpression("All parallel requests completed", "All requests should complete")
-
-        tr.StillRunningAfter = self.ts
-
-    def test_verify_cache_populated(self):
-        """
-        Verify the cache was populated after parallel requests.
-        Subsequent request should be fast (from cache).
-        """
-        tr = Test.AddTestRun("Verify cache hit after parallel requests")
-        ps = tr.Processes.Default
-
-        port = self.ts.Variables.port
-        # Request same URL - should be served from cache quickly
-        tr.MakeCurlCommandMulti(
-            f'''
-{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/delay/3
-''', ts=self.ts)
-
-        ps.ReturnCode = 0
-        ps.Streams.stdout.Content = Testers.ContainsExpression("200 OK", "Request should succeed")
-
-        tr.StillRunningAfter = self.ts
-
-    def test_no_crash_indicators(self):
-        """
-        Verify ATS logs don't contain crash indicators.
-        """
-        tr = Test.AddTestRun("Verify no crash indicators in logs")
-        ps = tr.Processes.Default
-
-        # Check traffic.out for crash indicators
-        ps.Command = f'grep -E "FATAL|ALERT|Emergency|ink_release_assert|ink_abort" {self.ts.Variables.LOGDIR}/traffic.out || echo "No crash indicators found"'
-
-        ps.ReturnCode = 0
-        ps.Streams.stdout.Content = Testers.ContainsExpression("No crash indicators found", "Should not find any crash indicators")
-
-        tr.StillRunningAfter = self.ts
-
-    def run(self):
-        self.test_parallel_requests()
-        self.test_verify_cache_populated()
-        self.test_no_crash_indicators()
-
-
-class ReadRetryRevalidationTest:
-    """
-    Test that READ_RETRY mode handles stale content revalidation with concurrent requests.
-
-    Strategy:
-    1. Populate cache with content that has a short TTL
-    2. Wait for content to become stale
-    3. Fire multiple parallel requests for the stale URL
-    4. All requests see stale content and attempt to revalidate
-    5. First request gets write lock, others enter READ_RETRY
-    6. All requests should complete successfully
-    """
-
-    def __init__(self):
-        self.__setup_origin_server()
-        self.__setup_ts()
-
-    def __setup_origin_server(self):
-        # Use httpbin for slow revalidation responses
-        self.server = Test.MakeHttpBinServer("httpbin-revalidate")
-
-    def __setup_ts(self):
-        self.ts = Test.MakeATSProcess("ts-revalidate", enable_cache=True)
-
-        self.ts.Disk.records_config.update(
-            {
-                'proxy.config.diags.debug.enabled': 1,
-                'proxy.config.diags.debug.tags': 'http|cache|http_cache|http_trans',
-                # Enable READ_RETRY mode
-                'proxy.config.http.cache.open_write_fail_action': 5,
-                # Configure write retry parameters
-                'proxy.config.http.cache.max_open_write_retries': 1,
-                'proxy.config.http.cache.max_open_write_retry_timeout': 5,
-                # Configure read retry parameters
-                'proxy.config.http.cache.max_open_read_retries': 10,
-                'proxy.config.http.cache.open_read_retry_time': 100,
-                # Enable read-while-writer
-                'proxy.config.cache.enable_read_while_writer': 1,
-                # Required for caching
-                'proxy.config.http.cache.required_headers': 0,
-                # Set short heuristic freshness so content becomes stale quickly
-                'proxy.config.http.cache.heuristic_min_lifetime': 1,
-                'proxy.config.http.cache.heuristic_max_lifetime': 2,
-            })
-
-        self.ts.Disk.remap_config.AddLine(f'map http://example.com/ http://127.0.0.1:{self.server.Variables.Port}/')
-
-    def test_populate_cache(self):
-        """
-        Populate cache with content using a slow origin response.
-        The /delay/3 endpoint returns after 3 seconds. With required_headers=0,
-        ATS will cache this content using heuristic freshness.
-        """
-        tr = Test.AddTestRun("Populate cache with content")
-        ps = tr.Processes.Default
-
-        ps.StartBefore(self.server)
-        ps.StartBefore(self.ts)
-
-        port = self.ts.Variables.port
-        # Use /delay/3 - same URL as revalidation test to ensure cache hit/revalidation
-        tr.MakeCurlCommandMulti(
-            f'''
-{{curl}} -s -D - -o /dev/null -H "Host: example.com" http://127.0.0.1:{port}/delay/3 && echo "Cache populated"
-''',
-            ts=self.ts)
-
-        ps.ReturnCode = 0
-        ps.Streams.stdout.Content = Testers.ContainsExpression("200 OK", "Initial request should succeed")
-        ps.Streams.stdout.Content += Testers.ContainsExpression("Cache populated", "Cache population message")
-
-        tr.StillRunningAfter = self.ts
-
-    def test_parallel_revalidation(self):
-        """
-        Wait for content to become stale, then send parallel revalidation requests.
-        All requests should complete successfully.
-        """
-        tr = Test.AddTestRun("Parallel revalidation requests for stale content")
-        ps = tr.Processes.Default
-
-        port = self.ts.Variables.port
-        # Wait 3 seconds for content to become stale, then fire parallel requests
-        tr.MakeCurlCommandMulti(
-            f'''
-# Wait for content to become stale
-sleep 3
-
-# Launch all curl requests in rapid succession (no subshells for faster startup)
-# The 3-second origin delay provides ample overlap time for write lock contention
-{{curl}} -s -o /dev/null -w "Revalidate 1: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
-{{curl}} -s -o /dev/null -w "Revalidate 2: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
-{{curl}} -s -o /dev/null -w "Revalidate 3: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
-{{curl}} -s -o /dev/null -w "Revalidate 4: HTTP %{{{{http_code}}}}\\n" -H "Host: example.com" http://127.0.0.1:{port}/delay/3 &
-wait
-echo "All revalidation requests completed"
-''',
-            ts=self.ts)
-
-        ps.ReturnCode = 0
-        # All revalidation requests should succeed
-        ps.Streams.stdout.Content = Testers.ContainsExpression("Revalidate 1: HTTP 200", "First revalidation should succeed")
-        ps.Streams.stdout.Content += Testers.ContainsExpression("Revalidate 2: HTTP 200", "Second revalidation should succeed")
-        ps.Streams.stdout.Content += Testers.ContainsExpression("Revalidate 3: HTTP 200", "Third revalidation should succeed")
-        ps.Streams.stdout.Content += Testers.ContainsExpression("Revalidate 4: HTTP 200", "Fourth revalidation should succeed")
-        ps.Streams.stdout.Content += Testers.ContainsExpression(
-            "All revalidation requests completed", "All requests should complete")
-
-        tr.StillRunningAfter = self.ts
-
-    def test_no_crash_indicators(self):
-        """
-        Verify ATS logs don't contain crash indicators.
-        """
-        tr = Test.AddTestRun("Verify no crash indicators after revalidation")
-        ps = tr.Processes.Default
-
-        ps.Command = f'grep -E "FATAL|ALERT|Emergency|ink_release_assert|ink_abort" {self.ts.Variables.LOGDIR}/traffic.out || echo "No crash indicators found"'
-
-        ps.ReturnCode = 0
-        ps.Streams.stdout.Content = Testers.ContainsExpression("No crash indicators found", "Should not find any crash indicators")
-
-        tr.StillRunningAfter = self.ts
-
-    def run(self):
-        self.test_populate_cache()
-        self.test_parallel_revalidation()
-        self.test_no_crash_indicators()
-
-
-# Run the parallel tests that actually trigger READ_RETRY
-ReadRetryParallelTest().run()
-ReadRetryRevalidationTest().run()

From f0c7f7aaa80263792e299dcdd1b277321fc9f334 Mon Sep 17 00:00:00 2001
From: Evan Zelkowitz <e_zelkowitz@apple.com>
Date: Mon, 2 Feb 2026 10:23:29 -0700
Subject: [PATCH 5/6] Addressing review comments

---
 .../gold_tests/cache/cache-read-retry-mode.test.py  |  2 --
 .../cache/replay/cache-read-retry-basic.replay.yaml | 11 +++++------
 .../replay/cache-read-retry-exhausted.replay.yaml   | 13 ++++++-------
 3 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/tests/gold_tests/cache/cache-read-retry-mode.test.py b/tests/gold_tests/cache/cache-read-retry-mode.test.py
index e5554c3eea1..9029c71328f 100644
--- a/tests/gold_tests/cache/cache-read-retry-mode.test.py
+++ b/tests/gold_tests/cache/cache-read-retry-mode.test.py
@@ -27,7 +27,5 @@
 
 Test.ContinueOnFail = True
 
-# Tests using Proxy Verifier replays (deterministic, CI-friendly)
-# These validate basic READ_RETRY behavior without timing-sensitive parallel requests
 Test.ATSReplayTest(replay_file="replay/cache-read-retry-basic.replay.yaml")
 Test.ATSReplayTest(replay_file="replay/cache-read-retry-exhausted.replay.yaml")
diff --git a/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml b/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml
index dd2c34e3b1d..f913d2af75c 100644
--- a/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml
+++ b/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml
@@ -20,7 +20,7 @@
 #
 # Scenario:
 # - Two concurrent requests arrive for the same uncached URL
-# - Request 1 gets write lock, connects to origin (slow, 3 second response)
+# - Request 1 gets write lock, connects to origin (slow, 2 second response)
 # - Request 2 fails write lock, enters READ_RETRY mode
 # - Request 2 retries reads, eventually gets read-while-writer access
 # - Both requests should succeed with 200 responses
@@ -31,9 +31,9 @@ meta:
   version: "1.0"
 
 autest:
+  description: 'Test cache_open_write_fail_action = 5 (READ_RETRY) with successful read-while-writer'
   dns:
     name: 'dns-read-retry-basic'
-  description: 'Test cache_open_write_fail_action = 5 (READ_RETRY) with successful read-while-writer'
 
   server:
     name: 'origin-read-retry-basic'
@@ -56,7 +56,6 @@ autest:
       proxy.config.http.cache.max_open_write_retry_timeout: 0
       proxy.config.http.cache.max_open_read_retries: 20
       proxy.config.http.cache.open_read_retry_time: 100
-      # Enable read-while-writer
       proxy.config.cache.enable_read_while_writer: 1
 
     remap_config:
@@ -95,8 +94,8 @@ sessions:
               - [X-Request, {value: 'first', as: equal}]
 
         server-response:
-          # Slow response (3 seconds) to allow second request to retry reads
-          delay: 3s
+          # Slow response (2 seconds) to allow second request to retry reads
+          delay: 2s
           status: 200
           reason: OK
           headers:
@@ -165,7 +164,7 @@ sessions:
 
         # Server should NOT receive this request (should be cache hit)
         server-response:
-          status: 200
+          status: 400
           reason: OK
           headers:
             fields:
diff --git a/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml b/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml
index 108a5fd3e10..380a880c16a 100644
--- a/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml
+++ b/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml
@@ -37,9 +37,9 @@ meta:
   version: "1.0"
 
 autest:
+  description: 'Test cache_open_write_fail_action = 5 (READ_RETRY) with slow origin and concurrent requests to verify request collapsing and stability'
   dns:
     name: 'dns-read-retry-exhausted'
-  description: 'Test cache_open_write_fail_action = 5 (READ_RETRY) with slow origin and concurrent requests to verify request collapsing and stability'
 
   server:
     name: 'origin-read-retry-exhausted'
@@ -63,7 +63,6 @@ autest:
       # Only 5 read retries @ 100ms = 500ms total (not enough for 10s origin)
       proxy.config.http.cache.max_open_read_retries: 5
       proxy.config.http.cache.open_read_retry_time: 100
-      # Enable read-while-writer
       proxy.config.cache.enable_read_while_writer: 1
 
     remap_config:
@@ -127,7 +126,7 @@ sessions:
   - transactions:
       - client-request:
           # Small delay to ensure first request gets write lock first
-          delay: 50ms
+          delay: 100ms
           method: "GET"
           version: "1.1"
           url: /slowfile
@@ -145,7 +144,7 @@ sessions:
         # This request should NOT reach the server if read-while-writer works
         # It should be served from Request 1's response
         server-response:
-          status: 200
+          status: 400
           reason: OK
           headers:
             fields:
@@ -168,7 +167,7 @@ sessions:
   - transactions:
       - client-request:
           # Arrives after second request starts but before first completes
-          delay: 100ms
+          delay: 300ms
           method: "GET"
           version: "1.1"
           url: /slowfile
@@ -185,7 +184,7 @@ sessions:
 
         # This request will also be served via read-while-writer
         server-response:
-          status: 200
+          status: 400
           reason: OK
           headers:
             fields:
@@ -220,7 +219,7 @@ sessions:
 
         # Server should NOT receive this request (should be cache hit)
         server-response:
-          status: 200
+          status: 400
           reason: OK
           headers:
             fields:

From cd61b1b60ced7ae17d17daee63e67f3fbfd7a298 Mon Sep 17 00:00:00 2001
From: Evan Zelkowitz <e_zelkowitz@apple.com>
Date: Mon, 2 Feb 2026 10:30:23 -0700
Subject: [PATCH 6/6] more comments

---
 src/proxy/http/HttpTransact.cc                |   2 +-
 .../cache/cache-read-retry-mode.test.py       |   3 +-
 .../replay/cache-read-retry-basic.replay.yaml | 179 ------------------
 ...play.yaml => cache-read-retry.replay.yaml} |   2 +-
 4 files changed, 3 insertions(+), 183 deletions(-)
 delete mode 100644 tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml
 rename tests/gold_tests/cache/replay/{cache-read-retry-exhausted.replay.yaml => cache-read-retry.replay.yaml} (99%)

diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc
index 5a9a3eba002..4c2e67cd9da 100644
--- a/src/proxy/http/HttpTransact.cc
+++ b/src/proxy/http/HttpTransact.cc
@@ -3354,7 +3354,7 @@ HttpTransact::set_cache_prepare_write_action_for_new_request(State *s)
     // that uses a different cache key (redirect_use_orig_cache_key == 0).
     // When redirect_use_orig_cache_key is enabled, the redirect uses the same cache key
     // as the original request, so we'd hit the same write lock contention.
-    Warning("set_cache_prepare_write_action_for_new_request called with READ_RETRY state");
+    Error("set_cache_prepare_write_action_for_new_request called with READ_RETRY state");
     s->cache_info.action = CacheAction_t::NO_ACTION;
   } else {
     s->cache_info.action           = CacheAction_t::PREPARE_TO_WRITE;
diff --git a/tests/gold_tests/cache/cache-read-retry-mode.test.py b/tests/gold_tests/cache/cache-read-retry-mode.test.py
index 9029c71328f..117a7b7bae8 100644
--- a/tests/gold_tests/cache/cache-read-retry-mode.test.py
+++ b/tests/gold_tests/cache/cache-read-retry-mode.test.py
@@ -27,5 +27,4 @@
 
 Test.ContinueOnFail = True
 
-Test.ATSReplayTest(replay_file="replay/cache-read-retry-basic.replay.yaml")
-Test.ATSReplayTest(replay_file="replay/cache-read-retry-exhausted.replay.yaml")
+Test.ATSReplayTest(replay_file="replay/cache-read-retry.replay.yaml")
diff --git a/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml b/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml
deleted file mode 100644
index f913d2af75c..00000000000
--- a/tests/gold_tests/cache/replay/cache-read-retry-basic.replay.yaml
+++ /dev/null
@@ -1,179 +0,0 @@
-#  Licensed to the Apache Software Foundation (ASF) under one
-#  or more contributor license agreements.  See the NOTICE file
-#  distributed with this work for additional information
-#  regarding copyright ownership.  The ASF licenses this file
-#  to you under the Apache License, Version 2.0 (the
-#  "License"); you may not use this file except in compliance
-#  with the License.  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-#
-# This replay file tests cache_open_write_fail_action = 5 (READ_RETRY)
-# with a basic scenario where read-while-writer eventually succeeds.
-#
-# Scenario:
-# - Two concurrent requests arrive for the same uncached URL
-# - Request 1 gets write lock, connects to origin (slow, 2 second response)
-# - Request 2 fails write lock, enters READ_RETRY mode
-# - Request 2 retries reads, eventually gets read-while-writer access
-# - Both requests should succeed with 200 responses
-# - Only ONE origin connection should be made (request collapsing)
-#
-
-meta:
-  version: "1.0"
-
-autest:
-  description: 'Test cache_open_write_fail_action = 5 (READ_RETRY) with successful read-while-writer'
-  dns:
-    name: 'dns-read-retry-basic'
-
-  server:
-    name: 'origin-read-retry-basic'
-
-  client:
-    name: 'client-read-retry-basic'
-
-  ats:
-    name: 'ts-read-retry-basic'
-    process_config:
-      enable_cache: true
-
-    records_config:
-      proxy.config.diags.debug.enabled: 1
-      proxy.config.diags.debug.tags: 'http|cache|http_cache'
-      # Enable READ_RETRY mode
-      proxy.config.http.cache.open_write_fail_action: 5
-      # Configure retry parameters
-      proxy.config.http.cache.max_open_write_retries: 1
-      proxy.config.http.cache.max_open_write_retry_timeout: 0
-      proxy.config.http.cache.max_open_read_retries: 20
-      proxy.config.http.cache.open_read_retry_time: 100
-      proxy.config.cache.enable_read_while_writer: 1
-
-    remap_config:
-      - from: "http://example.com/"
-        to: "http://backend.example.com:{SERVER_HTTP_PORT}/"
-
-    log_validation:
-      traffic_out:
-        # Should NOT contain crash indicators
-        excludes:
-          - expression: "FATAL|ALERT|Emergency|ink_release_assert"
-            description: "Verify ATS does not crash with READ_RETRY mode"
-        # Should contain cache operations (basic validation)
-        contains:
-          - expression: "state_cache_open_write|CACHE_EVENT_OPEN"
-            description: "Verify cache operations occur"
-
-sessions:
-  #############################################################################
-  # First session: Request that gets the write lock (slow origin response)
-  #############################################################################
-  - transactions:
-      - client-request:
-          method: "GET"
-          version: "1.1"
-          url: /bigfile
-          headers:
-            fields:
-              - [uuid, first-request-write-lock]
-              - [Host, example.com]
-              - [X-Request, first]
-
-        proxy-request:
-          headers:
-            fields:
-              - [X-Request, {value: 'first', as: equal}]
-
-        server-response:
-          # Slow response (2 seconds) to allow second request to retry reads
-          delay: 2s
-          status: 200
-          reason: OK
-          headers:
-            fields:
-              - [Content-Length, 100]
-              - [Cache-Control, "max-age=300"]
-              - [X-Response, from-origin]
-
-        proxy-response:
-          status: 200
-          headers:
-            fields:
-              - [X-Response, {value: 'from-origin', as: equal}]
-
-  #############################################################################
-  # Second session: Concurrent request that should use READ_RETRY
-  # This runs in parallel with the first session
-  #############################################################################
-  - transactions:
-      - client-request:
-          # Small delay to ensure first request gets write lock first
-          delay: 100ms
-          method: "GET"
-          version: "1.1"
-          url: /bigfile
-          headers:
-            fields:
-              - [uuid, second-request-read-retry]
-              - [Host, example.com]
-              - [X-Request, second]
-
-        # Server should NOT receive this request (read-while-writer should work)
-        # If server receives it, request collapsing failed
-        server-response:
-          status: 200
-          reason: OK
-          headers:
-            fields:
-              - [Content-Length, 100]
-              - [Cache-Control, "max-age=300"]
-              - [X-Response, duplicate-origin-connection]
-
-        # Proxy should respond with 200, either from read-while-writer or cache
-        proxy-response:
-          status: 200
-          headers:
-            fields:
-              # Should get the response, but preferably from read-while-writer
-              - [X-Response, {value: 'from-origin', as: equal}]
-
-  #############################################################################
-  # Third session: Verify content is cached after both requests complete
-  #############################################################################
-  - transactions:
-      - client-request:
-          # Wait for previous transactions to complete
-          delay: 4s
-          method: "GET"
-          version: "1.1"
-          url: /bigfile
-          headers:
-            fields:
-              - [uuid, third-request-cache-hit]
-              - [Host, example.com]
-              - [X-Request, third]
-
-        # Server should NOT receive this request (should be cache hit)
-        server-response:
-          status: 400
-          reason: OK
-          headers:
-            fields:
-              - [Content-Length, 100]
-              - [X-Response, should-not-see-this]
-
-        # Proxy should respond from cache
-        proxy-response:
-          status: 200
-          headers:
-            fields:
-              - [X-Response, {value: 'from-origin', as: equal}]
diff --git a/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml b/tests/gold_tests/cache/replay/cache-read-retry.replay.yaml
similarity index 99%
rename from tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml
rename to tests/gold_tests/cache/replay/cache-read-retry.replay.yaml
index 380a880c16a..49e46e887c4 100644
--- a/tests/gold_tests/cache/replay/cache-read-retry-exhausted.replay.yaml
+++ b/tests/gold_tests/cache/replay/cache-read-retry.replay.yaml
@@ -60,7 +60,7 @@ autest:
       # Configure retry parameters to exhaust quickly
       proxy.config.http.cache.max_open_write_retries: 1
       proxy.config.http.cache.max_open_write_retry_timeout: 0
-      # Only 5 read retries @ 100ms = 500ms total (not enough for 10s origin)
+      # Only 5 read retries @ 100ms = 500ms total (not enough for 3s origin)
       proxy.config.http.cache.max_open_read_retries: 5
       proxy.config.http.cache.open_read_retry_time: 100
       proxy.config.cache.enable_read_while_writer: 1