Skip to content

Commit c1a2239

Browse files
committed
feat(ai): implement parse_data_uri function and integrate it into OpenAI message handling
1 parent e8a1adc commit c1a2239

File tree

5 files changed

+153
-13
lines changed

5 files changed

+153
-13
lines changed

sentry_sdk/ai/utils.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,39 @@ class GEN_AI_ALLOWED_MESSAGE_ROLES:
4040
GEN_AI_MESSAGE_ROLE_MAPPING[source_role] = target_role
4141

4242

43+
def parse_data_uri(url: str) -> "Tuple[str, str]":
    """
    Parse a data URI and return (mime_type, content).

    Data URI format (RFC 2397): data:[<mediatype>][;base64],<data>

    Only the media type is extracted from the header. Header parameters such
    as ";base64" or ";charset=..." are dropped, and the payload is returned
    verbatim: it is neither base64-decoded nor percent-decoded.

    Examples:
        data:image/jpeg;base64,/9j/4AAQ... → ("image/jpeg", "/9j/4AAQ...")
        data:text/plain,Hello → ("text/plain", "Hello")
        data:;base64,SGVsbG8= → ("", "SGVsbG8=")

    Raises:
        ValueError: If the URL is not a valid data URI (missing comma separator)
    """
    # Split on the FIRST comma only: the payload itself may contain commas
    # (e.g. "data:text/csv,a,b,c").
    header, separator, content = url.partition(",")
    if not separator:
        raise ValueError("Invalid data URI: missing comma separator")

    # Header format: "data:<mime>[;param1][;param2]..." e.g. "data:image/jpeg;base64".
    # A header without the "data:" prefix is tolerated and used as-is.
    mime_part = header[5:] if header.startswith("data:") else header

    # The media type is everything before the first ";" (may be empty).
    mime_type = mime_part.partition(";")[0]

    return mime_type, content
74+
75+
4376
def _normalize_data(data: "Any", unpack: bool = True) -> "Any":
4477
# convert pydantic data (e.g. OpenAI v1+) to json compatible format
4578
if hasattr(data, "model_dump"):

sentry_sdk/integrations/openai.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from sentry_sdk.ai.utils import (
77
set_data_normalized,
88
normalize_message_roles,
9+
parse_data_uri,
910
truncate_and_annotate_messages,
1011
)
1112
from sentry_sdk.consts import SPANDATA
@@ -218,21 +219,33 @@ def _convert_message_parts(messages: "List[Dict[str, Any]]") -> "List[Dict[str,
218219
def _map_item(item: "Dict[str, Any]") -> "Dict[str, Any]":
219220
if item.get("type") == "image_url":
220221
image_url = item.get("image_url") or {}
221-
if image_url.get("url", "").startswith("data:"):
222-
return {
223-
"type": "blob",
224-
"modality": "image",
225-
"mime_type": item["image_url"]["url"].split(";base64,")[0],
226-
"content": item["image_url"]["url"].split(";base64,")[1],
227-
}
222+
url = image_url.get("url", "")
223+
if url.startswith("data:"):
224+
try:
225+
mime_type, content = parse_data_uri(url)
226+
return {
227+
"type": "blob",
228+
"modality": "image",
229+
"mime_type": mime_type,
230+
"content": content,
231+
}
232+
except ValueError:
233+
# If parsing fails, return as URI
234+
return {
235+
"type": "uri",
236+
"modality": "image",
237+
"uri": url,
238+
}
228239
else:
229240
return {
230241
"type": "uri",
231-
"uri": item["image_url"]["url"],
242+
"uri": url,
232243
}
233244
return item
234245

235246
for message in messages:
247+
if not isinstance(message, dict):
248+
continue
236249
content = message.get("content")
237250
if isinstance(content, list):
238251
message["content"] = [_map_item(item) for item in content]

sentry_sdk/integrations/openai_agents/utils.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from sentry_sdk.ai.utils import (
33
GEN_AI_ALLOWED_MESSAGE_ROLES,
44
normalize_message_roles,
5+
parse_data_uri,
56
set_data_normalized,
67
normalize_message_role,
78
truncate_and_annotate_messages,
@@ -66,17 +67,15 @@ def _transform_openai_agents_content_part(
6667
url = content_part.get("image_url", "")
6768

6869
if url.startswith("data:"):
69-
# Parse data URI: data:image/jpeg;base64,/9j/4AAQ...
7070
try:
71-
header, content = url.split(",", 1)
72-
mime_type = header.split(":")[1].split(";")[0] if ":" in header else ""
71+
mime_type, content = parse_data_uri(url)
7372
return {
7473
"type": "blob",
7574
"modality": "image",
7675
"mime_type": mime_type,
7776
"content": content,
7877
}
79-
except (ValueError, IndexError):
78+
except ValueError:
8079
# If parsing fails, return as URI
8180
return {
8281
"type": "uri",

tests/integrations/openai/test_openai.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1548,7 +1548,7 @@ def test_convert_message_parts_image_url_to_blob():
15481548
blob_item = converted[0]["content"][1]
15491549
assert blob_item["type"] == "blob"
15501550
assert blob_item["modality"] == "image"
1551-
assert blob_item["mime_type"] == "data:image/jpeg"
1551+
assert blob_item["mime_type"] == "image/jpeg"
15521552
assert blob_item["content"] == "/9j/4AAQSkZJRg=="
15531553
# Verify the original image_url structure is replaced
15541554
assert "image_url" not in blob_item
@@ -1581,6 +1581,34 @@ def test_convert_message_parts_image_url_to_uri():
15811581
assert "image_url" not in uri_item
15821582

15831583

1584+
def test_convert_message_parts_malformed_data_uri():
    """A data URI without a comma separator must degrade to a ``uri`` part instead of raising."""
    # Malformed on purpose: no ";base64" marker and no "," separator.
    malformed_url = "data:image/jpeg"
    image_part = {
        "type": "image_url",
        "image_url": {"url": malformed_url},
    }
    messages = [{"role": "user", "content": [image_part]}]

    # The conversion itself must complete without raising.
    converted = _convert_message_parts(messages)

    assert len(converted) == 1

    # The unparseable data URI falls back to a plain image URI part.
    fallback = converted[0]["content"][0]
    assert fallback["type"] == "uri"
    assert fallback["uri"] == "data:image/jpeg"
    assert fallback["modality"] == "image"
1610+
1611+
15841612
def test_openai_message_truncation(sentry_init, capture_events):
15851613
"""Test that large messages are truncated properly in OpenAI integration."""
15861614
sentry_init(

tests/test_ai_monitoring.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
truncate_and_annotate_messages,
1414
truncate_messages_by_size,
1515
_find_truncation_index,
16+
parse_data_uri,
1617
redact_blob_message_parts,
1718
)
1819
from sentry_sdk.serializer import serialize
@@ -646,3 +647,69 @@ def test_redacts_blobs_in_multiple_messages(self):
646647
assert messages[0]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE
647648
assert messages[1]["content"] == "I see the image." # Unchanged
648649
assert messages[2]["content"][1]["content"] == SENSITIVE_DATA_SUBSTITUTE
650+
651+
652+
class TestParseDataUri:
    """Behavioural checks for ``parse_data_uri``: media type extraction and raw payload passthrough."""

    def test_standard_base64_image(self):
        """A base64 JPEG URI yields its media type and the untouched payload."""
        mime_type, content = parse_data_uri("data:image/jpeg;base64,/9j/4AAQSkZJRg==")
        assert (mime_type, content) == ("image/jpeg", "/9j/4AAQSkZJRg==")

    def test_png_image(self):
        """A base64 PNG URI yields its media type and the untouched payload."""
        mime_type, content = parse_data_uri("data:image/png;base64,iVBORw0KGgo=")
        assert (mime_type, content) == ("image/png", "iVBORw0KGgo=")

    def test_plain_text_without_base64(self):
        """Without a ;base64 marker the payload is still returned verbatim (not percent-decoded)."""
        mime_type, content = parse_data_uri("data:text/plain,Hello%20World")
        assert (mime_type, content) == ("text/plain", "Hello%20World")

    def test_no_mime_type_with_base64(self):
        """An omitted media type before ;base64 results in an empty mime type."""
        mime_type, content = parse_data_uri("data:;base64,SGVsbG8=")
        assert (mime_type, content) == ("", "SGVsbG8=")

    def test_no_mime_type_no_base64(self):
        """The minimal form ``data:,<payload>`` results in an empty mime type."""
        mime_type, content = parse_data_uri("data:,Hello")
        assert (mime_type, content) == ("", "Hello")

    def test_content_with_commas(self):
        """Only the first comma separates header from payload; later commas belong to the payload."""
        mime_type, content = parse_data_uri("data:text/csv,a,b,c,d")
        assert (mime_type, content) == ("text/csv", "a,b,c,d")

    def test_missing_comma_raises_value_error(self):
        """A URI with no comma separator is rejected with ValueError."""
        with pytest.raises(ValueError, match="missing comma separator"):
            parse_data_uri("data:image/jpeg")

    def test_empty_content(self):
        """A trailing comma with nothing after it yields an empty payload."""
        mime_type, content = parse_data_uri("data:text/plain,")
        assert (mime_type, content) == ("text/plain", "")

    def test_mime_type_with_charset(self):
        """Header parameters such as ;charset are dropped from the returned mime type."""
        mime_type, content = parse_data_uri("data:text/html;charset=utf-8,<h1>Hello</h1>")
        assert (mime_type, content) == ("text/html", "<h1>Hello</h1>")

0 commit comments

Comments
 (0)