Skip to content

Commit 121c9d2

Browse files
feat(ai): add parse_data_uri function to parse a data URI (#5311)
Add a central function for parsing data URIs, for use in AI integrations where blob data is usually sent as a data URI.
1 parent 93855e7 commit 121c9d2

File tree

2 files changed

+117
-0
lines changed

2 files changed

+117
-0
lines changed

sentry_sdk/ai/utils.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,38 @@ class GEN_AI_ALLOWED_MESSAGE_ROLES:
4040
GEN_AI_MESSAGE_ROLE_MAPPING[source_role] = target_role
4141

4242

43+
def parse_data_uri(url: str) -> "Tuple[str, str]":
    """
    Parse a data URI and return (mime_type, content).

    Data URI format (RFC 2397): data:[<mediatype>][;base64],<data>

    Everything before the first comma is treated as the header and everything
    after it as the content. The content is returned exactly as it appears in
    the input (no base64- or percent-decoding is performed). A leading "data:"
    scheme is stripped case-insensitively; input without the scheme is still
    accepted and its header is used as the media type. Media-type parameters
    (e.g. ";charset=utf-8", ";base64") are dropped from the returned mime type.

    Examples:
        data:image/jpeg;base64,/9j/4AAQ... → ("image/jpeg", "/9j/4AAQ...")
        data:text/plain,Hello → ("text/plain", "Hello")
        data:;base64,SGVsbG8= → ("", "SGVsbG8=")

    Raises:
        ValueError: If the URL is not a valid data URI (missing comma separator)
    """
    # Only the first comma separates header from content; the content itself
    # may legitimately contain further commas.
    header, separator, content = url.partition(",")
    if not separator:
        raise ValueError("Invalid data URI: missing comma separator")

    # URI schemes are case-insensitive, so "DATA:" must be stripped as well;
    # otherwise the scheme would leak into the returned mime type.
    scheme = "data:"
    if header[: len(scheme)].lower() == scheme:
        header = header[len(scheme) :]

    # Header format: "<mime>[;param1][;param2]..." -> keep only "<mime>"
    mime_type = header.partition(";")[0]

    return mime_type, content
73+
74+
4375
def _normalize_data(data: "Any", unpack: bool = True) -> "Any":
4476
# convert pydantic data (e.g. OpenAI v1+) to json compatible format
4577
if hasattr(data, "model_dump"):

tests/test_ai_monitoring.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
truncate_and_annotate_messages,
1818
truncate_messages_by_size,
1919
_find_truncation_index,
20+
parse_data_uri,
2021
redact_blob_message_parts,
2122
)
2223
from sentry_sdk.serializer import serialize
@@ -757,3 +758,87 @@ def test_handles_non_dict_content_items(self):
757758

758759
# Should return same list since no blobs
759760
assert result is messages
761+
762+
763+
class TestParseDataUri:
    """Tests for parse_data_uri: mime type / content extraction and error cases."""

    def test_parses_base64_image_data_uri(self):
        """Test parsing a standard base64-encoded image data URI"""
        uri = "data:image/jpeg;base64,/9j/4AAQSkZJRg=="
        mime_type, content = parse_data_uri(uri)

        assert mime_type == "image/jpeg"
        assert content == "/9j/4AAQSkZJRg=="

    def test_parses_png_data_uri(self):
        """Test parsing a PNG image data URI"""
        uri = "data:image/png;base64,iVBORw0KGgo="
        mime_type, content = parse_data_uri(uri)

        assert mime_type == "image/png"
        assert content == "iVBORw0KGgo="

    def test_parses_plain_text_data_uri(self):
        """Test parsing a plain text data URI without base64 encoding"""
        uri = "data:text/plain,Hello World"
        mime_type, content = parse_data_uri(uri)

        assert mime_type == "text/plain"
        assert content == "Hello World"

    def test_parses_data_uri_with_empty_mime_type(self):
        """Test parsing a data URI with empty mime type"""
        uri = "data:;base64,SGVsbG8="
        mime_type, content = parse_data_uri(uri)

        assert mime_type == ""
        assert content == "SGVsbG8="

    def test_parses_data_uri_with_only_data_prefix(self):
        """Test parsing a data URI with only the data: prefix and content"""
        uri = "data:,Hello"
        mime_type, content = parse_data_uri(uri)

        assert mime_type == ""
        assert content == "Hello"

    def test_raises_on_missing_comma(self):
        """Test that ValueError is raised when comma separator is missing"""
        with pytest.raises(ValueError, match="missing comma separator"):
            parse_data_uri("data:image/jpeg;base64")

    def test_raises_on_empty_string(self):
        """Test that ValueError is raised for empty string"""
        with pytest.raises(ValueError, match="missing comma separator"):
            parse_data_uri("")

    def test_handles_content_with_commas(self):
        """Test that only the first comma is used as separator"""
        uri = "data:text/plain,Hello,World,With,Commas"
        mime_type, content = parse_data_uri(uri)

        assert mime_type == "text/plain"
        assert content == "Hello,World,With,Commas"

    def test_parses_data_uri_with_multiple_parameters(self):
        """Test parsing a data URI with multiple parameters in header"""
        uri = "data:text/plain;charset=utf-8;base64,SGVsbG8="
        mime_type, content = parse_data_uri(uri)

        assert mime_type == "text/plain"
        assert content == "SGVsbG8="

    def test_parses_audio_data_uri(self):
        """Test parsing an audio data URI"""
        uri = "data:audio/wav;base64,UklGRiQA"
        mime_type, content = parse_data_uri(uri)

        assert mime_type == "audio/wav"
        assert content == "UklGRiQA"

    def test_handles_uri_without_data_prefix(self):
        """Test parsing a URI that doesn't have the data: prefix"""
        uri = "image/jpeg;base64,/9j/4AAQ"
        mime_type, content = parse_data_uri(uri)

        assert mime_type == "image/jpeg"
        assert content == "/9j/4AAQ"

0 commit comments

Comments
 (0)