Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 53 additions & 4 deletions src/external/internet_archives/client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Client for the Internet Archive CDX and Save APIs."""
from asyncio import sleep
from asyncio import Semaphore

from aiolimiter import AsyncLimiter
Expand Down Expand Up @@ -123,25 +124,73 @@ async def search_for_url_snapshot(self: "InternetArchivesClient", url: str) -> I
error=None
)

async def _save_url(self: "InternetArchivesClient", url: str) -> int:
async def _save_url(self: "InternetArchivesClient", url: str) -> str:
async with self.session.post(
"http://web.archive.org/save",
"https://web.archive.org/save",
data={
"url": url,
"skip_first_archive": 1
},
headers={
"Authorization": f"LOW {self.s3_keys}",
"Accept": "application/json",
"Capture-Output": "json",
}
) as response:
response.raise_for_status()
payload: dict = await response.json(content_type=None)
job_id = payload.get("job_id")
if not job_id:
raise ValueError(
f"Save request for URL {url} succeeded without job_id: {payload}"
)
return str(job_id)

async def _get_save_status(self: "InternetArchivesClient", job_id: str) -> dict:
async with self.session.get(
f"https://web.archive.org/save/status/{job_id}",
headers={
"Authorization": f"LOW {self.s3_keys}",
"Accept": "application/json"
}
) as response:
response.raise_for_status()
return response.status
return await response.json(content_type=None)

async def _wait_for_save_completion(
self: "InternetArchivesClient",
job_id: str,
max_attempts: int = 10,
poll_interval_seconds: float = 2.0
) -> str | None:
for _ in range(max_attempts):
payload = await self._get_save_status(job_id)
raw_status = payload.get("status")
status = str(raw_status).lower() if raw_status is not None else ""

if status == "success":
return None

if status in {"error", "failed", "failure"}:
status_message = payload.get("message")
if status_message:
return str(status_message)
return f"Save request failed with payload: {payload}"

await sleep(poll_interval_seconds)

return f"Timed out waiting for Internet Archive save completion for job_id={job_id}"

async def save_to_internet_archives(self: "InternetArchivesClient", url: str) -> InternetArchivesSaveResponseInfo:
"""Save a URL to the Internet Archive."""
try:
_: int = await self._save_url(url)
job_id: str = await self._save_url(url)
save_error: str | None = await self._wait_for_save_completion(job_id)
if save_error:
return InternetArchivesSaveResponseInfo(
url=url,
error=save_error
)
except Exception as e:
return InternetArchivesSaveResponseInfo(
url=url,
Expand Down
124 changes: 124 additions & 0 deletions tests/automated/unit/external/internet_archives/test_client_save.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
"""Unit tests for Internet Archive save lifecycle."""
from unittest import mock

import pytest
from aiohttp import ClientSession

from src.external.internet_archives.client import InternetArchivesClient

TEST_URL = "https://example.gov/path"


@pytest.fixture
def mock_env() -> mock.MagicMock:
"""Patch environment access for S3 key loading."""
mock_path = "src.external.internet_archives.client.Env"
with mock.patch(mock_path) as mock_env_cls:
mock_env_instance = mock.MagicMock()
mock_env_instance.str.return_value = "fake_s3_keys"
mock_env_cls.return_value = mock_env_instance
yield mock_env_cls


@pytest.mark.asyncio
@pytest.mark.usefixtures("mock_env")
async def test_save_success_when_status_completes() -> None:

Check warning on line 25 in tests/automated/unit/external/internet_archives/test_client_save.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] tests/automated/unit/external/internet_archives/test_client_save.py#L25 <103>

Missing docstring in public function
Raw output
./tests/automated/unit/external/internet_archives/test_client_save.py:25:1: D103 Missing docstring in public function
session = mock.MagicMock(spec=ClientSession)
client = InternetArchivesClient(session)

with (
mock.patch.object(client, "_save_url", new=mock.AsyncMock(return_value="job_1")) as mock_save_url,
mock.patch.object(
client,
"_wait_for_save_completion",
new=mock.AsyncMock(return_value=None)
) as mock_wait
):
response = await client.save_to_internet_archives(TEST_URL)

mock_save_url.assert_awaited_once_with(TEST_URL)
mock_wait.assert_awaited_once_with("job_1")
assert response.url == TEST_URL
assert response.error is None


@pytest.mark.asyncio
@pytest.mark.usefixtures("mock_env")
async def test_save_error_when_status_fails() -> None:

Check warning on line 47 in tests/automated/unit/external/internet_archives/test_client_save.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] tests/automated/unit/external/internet_archives/test_client_save.py#L47 <103>

Missing docstring in public function
Raw output
./tests/automated/unit/external/internet_archives/test_client_save.py:47:1: D103 Missing docstring in public function
session = mock.MagicMock(spec=ClientSession)
client = InternetArchivesClient(session)

with (
mock.patch.object(client, "_save_url", new=mock.AsyncMock(return_value="job_1")),
mock.patch.object(
client,
"_wait_for_save_completion",
new=mock.AsyncMock(return_value="Failed to capture URL")
)
):
response = await client.save_to_internet_archives(TEST_URL)

assert response.url == TEST_URL
assert response.error == "Failed to capture URL"


@pytest.mark.asyncio
@pytest.mark.usefixtures("mock_env")
async def test_save_error_when_submit_raises() -> None:

Check warning on line 67 in tests/automated/unit/external/internet_archives/test_client_save.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] tests/automated/unit/external/internet_archives/test_client_save.py#L67 <103>

Missing docstring in public function
Raw output
./tests/automated/unit/external/internet_archives/test_client_save.py:67:1: D103 Missing docstring in public function
session = mock.MagicMock(spec=ClientSession)
client = InternetArchivesClient(session)

with mock.patch.object(
client, "_save_url", new=mock.AsyncMock(side_effect=RuntimeError("boom"))
):
response = await client.save_to_internet_archives(TEST_URL)

assert response.url == TEST_URL
assert response.error == "RuntimeError: boom"


@pytest.mark.asyncio
@pytest.mark.usefixtures("mock_env")
async def test_wait_for_save_completion_retries_until_success() -> None:

Check warning on line 82 in tests/automated/unit/external/internet_archives/test_client_save.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] tests/automated/unit/external/internet_archives/test_client_save.py#L82 <103>

Missing docstring in public function
Raw output
./tests/automated/unit/external/internet_archives/test_client_save.py:82:1: D103 Missing docstring in public function
session = mock.MagicMock(spec=ClientSession)
client = InternetArchivesClient(session)

mock_status = mock.AsyncMock(
side_effect=[
{"status": "pending"},
{"status": "success"},
]
)

with mock.patch.object(client, "_get_save_status", new=mock_status):
error = await client._wait_for_save_completion(
"job_1",
max_attempts=2,
poll_interval_seconds=0
)

assert error is None
assert mock_status.await_count == 2


@pytest.mark.asyncio
@pytest.mark.usefixtures("mock_env")
async def test_wait_for_save_completion_returns_failure_message() -> None:

Check warning on line 106 in tests/automated/unit/external/internet_archives/test_client_save.py

View workflow job for this annotation

GitHub Actions / flake8

[flake8] tests/automated/unit/external/internet_archives/test_client_save.py#L106 <103>

Missing docstring in public function
Raw output
./tests/automated/unit/external/internet_archives/test_client_save.py:106:1: D103 Missing docstring in public function
session = mock.MagicMock(spec=ClientSession)
client = InternetArchivesClient(session)

mock_status = mock.AsyncMock(
return_value={
"status": "failed",
"message": "robots.txt blocked capture"
}
)

with mock.patch.object(client, "_get_save_status", new=mock_status):
error = await client._wait_for_save_completion(
"job_1",
max_attempts=1,
poll_interval_seconds=0
)

assert error == "robots.txt blocked capture"