From a45096455a542fe7d56ed38a0b0ccf32887993ab Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 26 Feb 2026 19:49:54 -0500 Subject: [PATCH 1/2] fix(ia-save): verify async save status before marking success --- src/external/internet_archives/client.py | 56 +++++++- .../internet_archives/test_client_save.py | 124 ++++++++++++++++++ 2 files changed, 175 insertions(+), 5 deletions(-) create mode 100644 tests/automated/unit/external/internet_archives/test_client_save.py diff --git a/src/external/internet_archives/client.py b/src/external/internet_archives/client.py index f66b01e9..72af6914 100644 --- a/src/external/internet_archives/client.py +++ b/src/external/internet_archives/client.py @@ -1,4 +1,5 @@ """Client for the Internet Archive CDX and Save APIs.""" +from asyncio import sleep from asyncio import Semaphore from aiolimiter import AsyncLimiter @@ -123,25 +124,70 @@ async def search_for_url_snapshot(self: "InternetArchivesClient", url: str) -> I error=None ) - async def _save_url(self: "InternetArchivesClient", url: str) -> int: + async def _save_url(self: "InternetArchivesClient", url: str) -> str: async with self.session.post( - "http://web.archive.org/save", + "https://web.archive.org/save", data={ "url": url, "skip_first_archive": 1 }, headers={ "Authorization": f"LOW {self.s3_keys}", - "Accept": "application/json" + "Accept": "application/json", + "Capture-Output": "json", } ) as response: response.raise_for_status() - return response.status + payload: dict = await response.json(content_type=None) + job_id = payload.get("job_id") + if not job_id: + raise ValueError( + f"Save request for URL {url} succeeded without job_id: {payload}" + ) + return str(job_id) + + async def _get_save_status(self: "InternetArchivesClient", job_id: str) -> dict: + async with self.session.get( + f"https://web.archive.org/save/status/{job_id}", + headers={"Accept": "application/json"} + ) as response: + response.raise_for_status() + return await response.json(content_type=None) + + async def _wait_for_save_completion( + self: "InternetArchivesClient", + job_id: str, + max_attempts: int = 10, + poll_interval_seconds: float = 2.0 + ) -> str | None: + for _ in range(max_attempts): + payload = await self._get_save_status(job_id) + raw_status = payload.get("status") + status = str(raw_status).lower() if raw_status is not None else "" + + if status == "success": + return None + + if status in {"error", "failed", "failure"}: + status_message = payload.get("message") + if status_message: + return str(status_message) + return f"Save request failed with payload: {payload}" + + await sleep(poll_interval_seconds) + + return f"Timed out waiting for Internet Archive save completion for job_id={job_id}" async def save_to_internet_archives(self: "InternetArchivesClient", url: str) -> InternetArchivesSaveResponseInfo: """Save a URL to the Internet Archive.""" try: - _: int = await self._save_url(url) + job_id: str = await self._save_url(url) + save_error: str | None = await self._wait_for_save_completion(job_id) + if save_error: + return InternetArchivesSaveResponseInfo( + url=url, + error=save_error + ) except Exception as e: return InternetArchivesSaveResponseInfo( url=url, diff --git a/tests/automated/unit/external/internet_archives/test_client_save.py b/tests/automated/unit/external/internet_archives/test_client_save.py new file mode 100644 index 00000000..7e0389f7 --- /dev/null +++ b/tests/automated/unit/external/internet_archives/test_client_save.py @@ -0,0 +1,124 @@ +"""Unit tests for Internet Archive save lifecycle.""" +from unittest import mock + +import pytest +from aiohttp import ClientSession + +from src.external.internet_archives.client import InternetArchivesClient + +TEST_URL = "https://example.gov/path" + + +@pytest.fixture +def mock_env() -> mock.MagicMock: + """Patch environment access for S3 key loading.""" + mock_path = "src.external.internet_archives.client.Env" + with mock.patch(mock_path) as mock_env_cls: + mock_env_instance = mock.MagicMock() + mock_env_instance.str.return_value = "fake_s3_keys" + mock_env_cls.return_value = mock_env_instance + yield mock_env_cls + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("mock_env") +async def test_save_success_when_status_completes() -> None: + session = mock.MagicMock(spec=ClientSession) + client = InternetArchivesClient(session) + + with ( + mock.patch.object(client, "_save_url", new=mock.AsyncMock(return_value="job_1")) as mock_save_url, + mock.patch.object( + client, + "_wait_for_save_completion", + new=mock.AsyncMock(return_value=None) + ) as mock_wait + ): + response = await client.save_to_internet_archives(TEST_URL) + + mock_save_url.assert_awaited_once_with(TEST_URL) + mock_wait.assert_awaited_once_with("job_1") + assert response.url == TEST_URL + assert response.error is None + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("mock_env") +async def test_save_error_when_status_fails() -> None: + session = mock.MagicMock(spec=ClientSession) + client = InternetArchivesClient(session) + + with ( + mock.patch.object(client, "_save_url", new=mock.AsyncMock(return_value="job_1")), + mock.patch.object( + client, + "_wait_for_save_completion", + new=mock.AsyncMock(return_value="Failed to capture URL") + ) + ): + response = await client.save_to_internet_archives(TEST_URL) + + assert response.url == TEST_URL + assert response.error == "Failed to capture URL" + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("mock_env") +async def test_save_error_when_submit_raises() -> None: + session = mock.MagicMock(spec=ClientSession) + client = InternetArchivesClient(session) + + with mock.patch.object( + client, "_save_url", new=mock.AsyncMock(side_effect=RuntimeError("boom")) + ): + response = await client.save_to_internet_archives(TEST_URL) + + assert response.url == TEST_URL + assert response.error == "RuntimeError: boom" + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("mock_env") +async def test_wait_for_save_completion_retries_until_success() -> None: + session = mock.MagicMock(spec=ClientSession) + client = InternetArchivesClient(session) + + mock_status = mock.AsyncMock( + side_effect=[ + {"status": "pending"}, + {"status": "success"}, + ] + ) + + with mock.patch.object(client, "_get_save_status", new=mock_status): + error = await client._wait_for_save_completion( + "job_1", + max_attempts=2, + poll_interval_seconds=0 + ) + + assert error is None + assert mock_status.await_count == 2 + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("mock_env") +async def test_wait_for_save_completion_returns_failure_message() -> None: + session = mock.MagicMock(spec=ClientSession) + client = InternetArchivesClient(session) + + mock_status = mock.AsyncMock( + return_value={ + "status": "failed", + "message": "robots.txt blocked capture" + } + ) + + with mock.patch.object(client, "_get_save_status", new=mock_status): + error = await client._wait_for_save_completion( + "job_1", + max_attempts=1, + poll_interval_seconds=0 + ) + + assert error == "robots.txt blocked capture" From 180365b4821f202b4c9bba961aaa5ba87601a206 Mon Sep 17 00:00:00 2001 From: Max Chis Date: Thu, 26 Feb 2026 19:57:12 -0500 Subject: [PATCH 2/2] fix(ia-save): authorize status polling requests --- src/external/internet_archives/client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/external/internet_archives/client.py b/src/external/internet_archives/client.py index 72af6914..9d180e96 100644 --- a/src/external/internet_archives/client.py +++ b/src/external/internet_archives/client.py @@ -149,7 +149,10 @@ async def _save_url(self: "InternetArchivesClient", url: str) -> str: async def _get_save_status(self: "InternetArchivesClient", job_id: str) -> dict: async with self.session.get( f"https://web.archive.org/save/status/{job_id}", - headers={"Accept": "application/json"} + headers={ + "Authorization": f"LOW {self.s3_keys}", + "Accept": "application/json" + } ) as response: response.raise_for_status() return await response.json(content_type=None)