def parse_data_uri(url: str) -> "Tuple[str, str]":
    """
    Parse a data URI and return (mime_type, content).

    Data URI format (RFC 2397): data:[<mediatype>][;base64],<data>

    The mime type is the text between the optional "data:" scheme prefix and
    the first ";" (or the "," separator when there are no parameters).
    Parameters such as ";charset=utf-8" or ";base64" are discarded, and the
    content is returned verbatim (it is NOT base64- or percent-decoded).

    The scheme prefix is matched case-insensitively, and a missing prefix is
    tolerated: the whole header is then treated as the media type section.

    Examples:
        data:image/jpeg;base64,/9j/4AAQ... → ("image/jpeg", "/9j/4AAQ...")
        data:text/plain,Hello → ("text/plain", "Hello")
        data:;base64,SGVsbG8= → ("", "SGVsbG8=")
        DATA:image/png;base64,iVBORw0KGgo= → ("image/png", "iVBORw0KGgo=")

    Raises:
        ValueError: If the URL is not a valid data URI (missing comma separator)
    """
    # Only the first comma separates header from payload; the payload itself
    # may legitimately contain further commas.
    header, separator, content = url.partition(",")
    if not separator:
        raise ValueError("Invalid data URI: missing comma separator")

    # Header format: "data:<mime>[;param1][;param2]..." e.g. "data:image/jpeg;base64"
    # URI schemes are case-insensitive, so "DATA:" / "Data:" must be stripped too.
    scheme = "data:"
    if header[: len(scheme)].lower() == scheme:
        header = header[len(scheme) :]

    # Everything before the first semicolon is the mime type (may be empty).
    mime_type = header.split(";", 1)[0]

    return mime_type, content
class TestParseDataUri:
    """Behavioural checks for parse_data_uri: mime type extraction and payload passthrough."""

    def test_parses_base64_image_data_uri(self):
        """A base64 JPEG data URI yields its mime type and the raw payload."""
        mime, payload = parse_data_uri("data:image/jpeg;base64,/9j/4AAQSkZJRg==")

        assert mime == "image/jpeg"
        assert payload == "/9j/4AAQSkZJRg=="

    def test_parses_png_data_uri(self):
        """A base64 PNG data URI yields its mime type and the raw payload."""
        mime, payload = parse_data_uri("data:image/png;base64,iVBORw0KGgo=")

        assert mime == "image/png"
        assert payload == "iVBORw0KGgo="

    def test_parses_plain_text_data_uri(self):
        """A data URI without the base64 marker is split the same way."""
        mime, payload = parse_data_uri("data:text/plain,Hello World")

        assert mime == "text/plain"
        assert payload == "Hello World"

    def test_parses_data_uri_with_empty_mime_type(self):
        """A header holding only ';base64' produces an empty mime type."""
        mime, payload = parse_data_uri("data:;base64,SGVsbG8=")

        assert mime == ""
        assert payload == "SGVsbG8="

    def test_parses_data_uri_with_only_data_prefix(self):
        """A bare 'data:' header produces an empty mime type."""
        mime, payload = parse_data_uri("data:,Hello")

        assert mime == ""
        assert payload == "Hello"

    def test_raises_on_missing_comma(self):
        """A URI lacking the comma separator is rejected with ValueError."""
        with pytest.raises(ValueError, match="missing comma separator"):
            parse_data_uri("data:image/jpeg;base64")

    def test_raises_on_empty_string(self):
        """The empty string has no comma separator and is rejected."""
        with pytest.raises(ValueError, match="missing comma separator"):
            parse_data_uri("")

    def test_handles_content_with_commas(self):
        """Only the first comma splits header from content; later ones are kept."""
        mime, payload = parse_data_uri("data:text/plain,Hello,World,With,Commas")

        assert mime == "text/plain"
        assert payload == "Hello,World,With,Commas"

    def test_parses_data_uri_with_multiple_parameters(self):
        """Every header parameter after the mime type is dropped."""
        mime, payload = parse_data_uri("data:text/plain;charset=utf-8;base64,SGVsbG8=")

        assert mime == "text/plain"
        assert payload == "SGVsbG8="

    def test_parses_audio_data_uri(self):
        """An audio data URI is handled like any other media type."""
        mime, payload = parse_data_uri("data:audio/wav;base64,UklGRiQA")

        assert mime == "audio/wav"
        assert payload == "UklGRiQA"

    def test_handles_uri_without_data_prefix(self):
        """Input lacking the 'data:' prefix is still split on ';' and ','."""
        mime, payload = parse_data_uri("image/jpeg;base64,/9j/4AAQ")

        assert mime == "image/jpeg"
        assert payload == "/9j/4AAQ"